vllm/.buildkite/test-amd.yaml

# In this file, you can add more tests to run either by adding a new step or
# adding a new command to an existing step. See different options here for examples.

# This script will be feed into Jinja template in `test-template-aws.j2` at
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
# to generate the final pipeline yaml file.

# Documentation
# label(str): the name of the test. emojis allowed.
# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline.
# fast_check_only(bool): run this test on the fastcheck pipeline only
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests).
# command(str): the single command to run for tests. incompatible with commands.
# commands(list): the list of commands to run for the test. incompatible with command.
# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental]
# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4.
# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host,
#     in this case, commands must be specified. the first command runs on the first host, the second
#     command runs on the second host.
# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout.
# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB
#     and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.

# When adding a test
# - If the test belongs to an existing group, add it there
# - If the test is short, add to any existing step
# - If the test takes more than 10min, then it is okay to create a new step.
#   Note that all steps execute in parallel.


#####################################################################################################################################
#                                                                                                                                   #
#                                                             README                                                                #
#                                                                                                                                   #
#####################################################################################################################################
#                                                                                                                                   #
# IMPORTANT:                                                                                                                        #
#   * Currently AMD CI has MI250 agents, MI325 agents, and MI355 agents. All upcoming feature improvements are tracked in:          #
#         https://github.com/vllm-project/vllm/issues/34994                                                                         #
#                                                                                                                                   #
#-----------------------------------------------------------------------------------------------------------------------------------#
#                                                                                                                                   #
# NOTES:                                                                                                                            #
#   * [Pytorch Nightly Dependency Override Check]: if this test fails, it means the nightly torch version is not compatible with    #
#                                                  some of the dependencies. Please check the error message and add the package to  #
#                                                  whitelist in `/vllm/tools/pre_commit/generate_nightly_torch_test.py`.            #
#   * [Entrypoints Integration (LLM)]:                                                                                              #
#     - {`pytest -v -s entrypoints/llm/test_generate.py`}: It needs a clean process                                                 #
#     - {`pytest -v -s entrypoints/offline_mode`}: Needs to avoid interference with other tests                                     #
#   * [Engine / Engine (1 GPU) / e2e Scheduling / e2e Core / V1 e2e / Spec Decode / V1 Sample + Logits / V1 Core + KV + Metrics]:   #
#     - Previously a single "V1 Test e2e + engine" step, now split across multiple groups.                                          #
#     - V1 e2e (2/4 GPUs) uses 4 GPUs but is scheduled on 8-GPU machines for stability. See:                                        #
#       https://github.com/vllm-project/vllm/pull/31040                                                                             #
#   * [V1 Sample + Logits / V1 Core + KV + Metrics / V1 others (CPU)]:                                                              #
#     - Previously a single "V1 others" step, now split to avoid interference.                                                      #
#     - Integration test for streaming correctness (requires special branch for __harness__ lib).                                   #
#   * [V1 others (CPU)]: Split the tests to avoid interference                                                                      #
#   * [PyTorch Compilation Unit Tests]: Run unit tests defined directly under `compile/`, not including subdirectories, which       #
#                                       are usually heavier tests covered elsewhere. Use `find` to launch multiple instances        #
#                                       of pytest so that they do not suffer from:                                                  #
#                                       https://github.com/vllm-project/vllm/issues/28965                                           #
#   * [PyTorch Fullgraph Smoke Test]: Run smoke tests under fullgraph directory, except `test_full_graph.py` as it is a heavy       #
#                                     test that is covered in other steps. Use `find` to launch multiple instances of pytest        #
#                                     so that they do not suffer from: https://github.com/vllm-project/vllm/issues/28965            #
#   * [PyTorch Fullgraph]:                                                                                                          #
#     - Limit to no custom ops to reduce running time. Wrap with quotes to escape yaml and avoid starting `-k` string               #
#       with a `-`                                                                                                                  #
#     - Old E2E tests such as:                                                                                                      #
#           ```bash                                                                                                                 #
#           pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'                     #
#           ```                                                                                                                     #
#       were removed in https://github.com/vllm-project/vllm/pull/33293 in favor of new tests in `fusions_e2e`. We                  #
#       avoid replicating the new jobs in this file as it's deprecated.                                                             #
#   * [Basic Models Tests (Extra Initialization) %N]: Only when vLLM model source is modified - test initialization of a            #
#                                                     large subset of supported models (the complement of the small subset in       #
#                                                     the above test.) Also run if model initialization test file is modified.      #
#   * [Language Models Tests (Extra Standard) %N]: Shard slow subset of standard language models tests. Only run when model         #
#                                                  source is modified, or when specified test files are modified.                   #
#   * [Language Models Tests (Hybrid) %N]: Install fast path packages for testing against transformers (mamba, conv1d) and to       #
#                                          run plamo2 model in vLLM.                                                                #
#   * [Language Models Test (Extended Generation)]: Install fast path packages for testing against transformers (mamba, conv1d)     #
#                                                   and to run plamo2 model in vLLM.                                                #
#   * [Multi-Modal Models (Standard) 1-4]:                                                                                          #
#     - Do NOT remove `VLLM_WORKER_MULTIPROC_METHOD=spawn` setting as ROCm requires this for certain models to function.            #
#   * [Transformers Nightly Models]: Whisper needs `VLLM_WORKER_MULTIPROC_METHOD=spawn` to avoid deadlock.                          #
#   * [Plugin Tests (2 GPUs)]:                                                                                                      #
#     - {`pytest -v -s entrypoints/openai/test_oot_registration.py`}: It needs a clean process                                      #
#     - {`pytest -v -s models/test_oot_registration.py`}: It needs a clean process                                                  #
#     - {`pytest -v -s plugins/lora_resolvers`}: Unit tests for in-tree lora resolver plugins                                       #
#   * [LoRA TP (Distributed)]:                                                                                                      #
#     - There is some Tensor Parallelism related processing logic in LoRA that requires multi-GPU testing for validation.           #
#     - {`pytest -v -s -x lora/test_gptoss_tp.py`}: Disabled for now because MXFP4 backend on non-cuda platform doesn't support     #
#                                                   LoRA yet.                                                                       #
#   * [Distributed Tests (NxGPUs)(HW-TAG)]: Don't test llama model here, it seems hf implementation is buggy. See:                  #
#                                           https://github.com/vllm-project/vllm/pull/5689                                          #
#   * [Distributed Tests (NxGPUs)(HW-TAG)]: Some old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293     #
#                                           in favor of new tests in fusions_e2e. We avoid replicating the new jobs in              #
#                                           this file as it's deprecated.                                                           #
#                                                                                                                                   #
#####################################################################################################################################


steps:


#####################################################################################################################################
#                                                                                                                                   #
#  MI250 test definitions ( currently the test set is completely mirrored // TBD which tests are to be routed there ultimately)     #
#                                                                                                                                   #
#####################################################################################################################################

- label: Pytorch Nightly Dependency Override Check # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  optional: true
  soft_fail: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - requirements/nightly_torch_test.txt
  - vllm/platforms/rocm.py
  commands:
  - bash standalone_tests/pytorch_nightly_dependency.sh


- label: Async Engine, Inputs, Utils, Worker # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/detokenizer
  - tests/multimodal
  - tests/utils_
  commands:
  - pytest -v -s detokenizer
  - pytest -v -s -m 'not cpu_test' multimodal
  - pytest -v -s utils_


- label: Async Engine, Inputs, Utils, Worker, Config (CPU) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  optional: true
  no_gpu: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/test_inputs.py
  - tests/test_outputs.py
  - tests/test_pooling_params.py
  - tests/test_ray_env.py
  - tests/multimodal
  - tests/renderers
  - tests/standalone_tests/lazy_imports.py
  - tests/tokenizers_
  - tests/tool_parsers
  - tests/transformers_utils
  - tests/config
  commands:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
  - pytest -v -s test_pooling_params.py
  - pytest -v -s test_ray_env.py
  - pytest -v -s -m 'cpu_test' multimodal
  - pytest -v -s renderers
  - pytest -v -s tokenizers_
  - pytest -v -s tool_parsers
  - pytest -v -s transformers_utils
  - pytest -v -s config


- label: Python-only Installation # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - tests/standalone_tests/python_only_compile.sh
  - setup.py
  - vllm/platforms/rocm.py
  commands:
  - bash standalone_tests/python_only_compile.sh


- label: Basic Correctness # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  fast_check: true
  torch_nightly: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/basic_correctness/test_basic_correctness
  - tests/basic_correctness/test_cpu_offload
  - tests/basic_correctness/test_cumem.py
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s basic_correctness/test_cumem.py
  - pytest -v -s basic_correctness/test_basic_correctness.py
  - pytest -v -s basic_correctness/test_cpu_offload.py


- label: Entrypoints Unit Tests # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  fast_check: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/entrypoints
  - tests/entrypoints/
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s entrypoints/openai/tool_parsers
  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/rpc --ignore=entrypoints/sleep --ignore=entrypoints/serve/instrumentator --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py  --ignore=entrypoints/pooling


- label: Entrypoints Integration (LLM) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  fast_check: true
  torch_nightly: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/llm
  - tests/entrypoints/offline_mode
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
  - pytest -v -s entrypoints/llm/test_generate.py
  - pytest -v -s entrypoints/offline_mode


- label: Entrypoints Integration (API Server 2) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  fast_check: true
  torch_nightly: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/rpc
  - tests/entrypoints/serve/instrumentator
  - tests/tool_use
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/serve/instrumentator
  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
  - pytest -v -s tool_use


- label: Entrypoints Integration (Responses API) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  fast_check: true
  torch_nightly: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/openai/responses
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/openai/responses


- label: EPLB Algorithm # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/distributed/eplb
  - tests/distributed/test_eplb_algo.py
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s distributed/test_eplb_algo.py


- label: EPLB Execution # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_4
  num_gpus: 4
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/distributed/eplb
  - tests/distributed/test_eplb_execute.py
  - tests/distributed/test_eplb_spec_decode.py
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s distributed/test_eplb_execute.py
  - pytest -v -s distributed/test_eplb_spec_decode.py


- label: Elastic EP Scaling Test # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_4
  num_gpus: 4
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/compilation/
  - tests/distributed/
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s distributed/test_elastic_ep.py


- label: Metrics, Tracing (2 GPUs) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  num_gpus: 2
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/v1/tracing
  commands:
  - "pip install \
      'opentelemetry-sdk>=1.26.0' \
      'opentelemetry-api>=1.26.0' \
      'opentelemetry-exporter-otlp>=1.26.0' \
      'opentelemetry-semantic-conventions-ai>=0.4.1'"
  - pytest -v -s v1/tracing


- label: Regression # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/test_regression
  commands:
  - pip install modelscope
  - pytest -v -s test_regression.py


- label: Engine # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/engine
  - tests/test_sequence
  - tests/test_config
  - tests/test_logger
  - tests/test_vllm_port
  commands:
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py


- label: Engine (1 GPU) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/v1/
  - tests/v1/engine/
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s v1/engine/test_preprocess_error_handling.py
  - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py


- label: e2e Scheduling (1 GPU) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/v1/
  - tests/v1/e2e/general/
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s v1/e2e/general/test_async_scheduling.py


- label: e2e Core (1 GPU) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/v1/
  - tests/v1/e2e/general/
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py


- label: Spec Decode Speculators + MTP # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/v1/spec_decode/
  - vllm/v1/worker/gpu/spec_decode/
  - vllm/model_executor/model_loader/
  - vllm/v1/sample/
  - vllm/model_executor/layers/
  - vllm/transformers_utils/configs/speculators/
  - tests/v1/e2e/spec_decode/
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s v1/e2e/spec_decode -k "speculators or mtp_correctness"


- label: Spec Decode Ngram + Suffix # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/v1/spec_decode/
  - vllm/v1/worker/gpu/spec_decode/
  - vllm/model_executor/model_loader/
  - vllm/v1/sample/
  - vllm/model_executor/layers/
  - tests/v1/e2e/spec_decode/
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s v1/e2e/spec_decode -k "ngram or suffix"


- label: Spec Decode Draft Model # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/v1/spec_decode/
  - vllm/v1/worker/gpu/spec_decode/
  - vllm/model_executor/model_loader/
  - vllm/v1/sample/
  - vllm/model_executor/layers/
  - tests/v1/e2e/spec_decode/
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s v1/e2e/spec_decode -k "draft_model or no_sync or batch_inference"


- label: V1 e2e (2 GPUs) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/v1/e2e
  commands:
    - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"


- label: V1 Sample + Logits # TBD
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/v1/sample
  - tests/v1/logits_processors
  - tests/v1/test_oracle.py
  - tests/v1/test_request.py
  - tests/v1/test_outputs.py
  commands:
  - pytest -v -s v1/sample
  - pytest -v -s v1/logits_processors
  - pytest -v -s v1/test_oracle.py
  - pytest -v -s v1/test_request.py
  - pytest -v -s v1/test_outputs.py


- label: V1 Core + KV + Metrics # TBD
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/v1/core
  - tests/v1/executor
  - tests/v1/kv_offload
  - tests/v1/worker
  - tests/v1/kv_connector/unit
  - tests/v1/metrics
  - tests/entrypoints/openai/correctness/test_lmeval.py
  commands:
  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
  - pytest -v -s -m 'not cpu_test' v1/core
  - pytest -v -s v1/executor
  - pytest -v -s v1/kv_offload
  - pytest -v -s v1/worker
  - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
  - pytest -v -s -m 'not cpu_test' v1/metrics
  - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
  - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine


- label: V1 Speculative Decoding (slow) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/v1/spec_decode/
  - vllm/model_executor/models/
  - vllm/v1/attention/
  - vllm/model_executor/layers/
  - tests/v1/spec_decode/
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s -m 'slow_test' v1/spec_decode/test_eagle.py
  - pytest -v -s -m 'slow_test' v1/spec_decode/test_extract_hidden_states.py
  - pytest -v -s -m 'slow_test' v1/spec_decode/test_max_len.py
  - pytest -v -s -m 'slow_test' v1/spec_decode/test_mtp.py
  - pytest -v -s -m 'slow_test' v1/spec_decode/test_ngram.py
  - pytest -v -s -m 'slow_test' v1/spec_decode/test_speculators_eagle3.py
  - pytest -v -s -m 'slow_test' v1/spec_decode/test_tree_attention.py


- label: V1 attention (H100-MI250) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/config/attention.py
  - vllm/model_executor/layers/attention
  - vllm/v1/attention
  - tests/v1/attention
  - vllm/_aiter_ops.py
  - vllm/envs.py
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s v1/attention


- label: V1 others (CPU) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  no_gpu: true
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/v1
  commands:
  - pytest -v -s -m 'cpu_test' v1/core
  - pytest -v -s v1/structured_output
  - pytest -v -s v1/test_serial_utils.py
  - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
  - pytest -v -s -m 'cpu_test' v1/metrics


- label: Examples # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  optional: true
  working_dir: "/vllm-workspace/examples"
  source_file_dependencies:
  - vllm/entrypoints
  - vllm/multimodal
  - examples/
  - vllm/platforms/rocm.py
  commands:
    - pip install tensorizer
    # Basic
    - python3 basic/offline_inference/chat.py --attention-backend TRITON_ATTN
    - python3 basic/offline_inference/generate.py --model facebook/opt-125m
    - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
    - python3 basic/offline_inference/classify.py
    - python3 basic/offline_inference/embed.py
    - python3 basic/offline_inference/score.py
    # Multi-modal models
    - python3 offline_inference/audio_language.py --seed 0
    - python3 offline_inference/vision_language.py --seed 0
    - python3 offline_inference/vision_language_multi_image.py --seed 0
    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
    # Pooling models
    - python3 pooling/embed/vision_embedding_offline.py --seed 0
    # Features demo
    - python3 offline_inference/prefix_caching.py
    - python3 offline_inference/llm_engine_example.py
    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536


- label: Platform Tests (CUDA) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/cuda
  commands:
  - pytest -v -s cuda/test_cuda_context.py
  - pytest -v -s cuda/test_platform_no_cuda_init.py


- label: Samplers Test # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/model_executor/layers
  - vllm/sampling_metadata.py
  - vllm/v1/sample/
  - vllm/beam_search.py
  - tests/samplers
  - tests/conftest.py
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s samplers


- label: LoRA %N # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  parallelism: 4
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/lora
  - tests/lora
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_llm_with_multi_loras.py --ignore=lora/test_olmoe_tp.py --ignore=lora/test_deepseekv2_tp.py --ignore=lora/test_gptoss_tp.py --ignore=lora/test_qwen3moe_tp.py


- label: PyTorch Compilation Unit Tests # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  torch_nightly: true
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/compilation/
  - vllm/model_executor/layers/
  - vllm/v1/worker/
  - vllm/v1/attention/
  - vllm/v1/cudagraph_dispatcher.py
  - vllm/config/compilation.py
  - csrc/
  - tests/compile
  - vllm/platforms/rocm.py
  commands:
  - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"


- label: PyTorch Fullgraph Smoke Test # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  optional: true
  torch_nightly: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/compilation/
  - vllm/model_executor/
  - vllm/v1/attention/
  - vllm/config/compilation.py
  - csrc/
  - tests/compile
  - vllm/platforms/rocm.py
  commands:
  - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\\\;"


- label: PyTorch Fullgraph # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  optional: true
  torch_nightly: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/compilation/
  - vllm/model_executor/
  - vllm/v1/attention/
  - vllm/config/compilation.py
  - csrc/
  - tests/compile
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'


- label: Cudagraph # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - tests/v1/cudagraph
  - vllm/v1/cudagraph_dispatcher.py
  - vllm/config/compilation.py
  - vllm/compilation
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
  - pytest -v -s v1/cudagraph/test_cudagraph_mode.py


- label: Kernels Core Operation Test # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - csrc/
  - tests/kernels/core
  - tests/kernels/test_top_k_per_row.py
  - tests/kernels/test_concat_mla_q.py
  - vllm/model_executor/layers/rotary_embedding/
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s kernels/core kernels/test_top_k_per_row.py


- label: Kernels Mamba Test # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - csrc/mamba/
  - tests/kernels/mamba
  - vllm/model_executor/layers/mamba/ops
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s kernels/mamba


- label: Kernels Helion Test # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/utils/import_utils.py
  - tests/kernels/helion/
  - vllm/platforms/rocm.py
  commands:
  - pip install helion==0.3.3
  - pytest -v -s kernels/helion/


- label: Model Executor # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  optional: true
  torch_nightly: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/engine/arg_utils.py
  - vllm/config/model.py
  - vllm/model_executor
  - tests/model_executor
  - tests/entrypoints/openai/completion/test_tensorizer_entrypoint.py
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - apt-get update && apt-get install -y curl libsodium23
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s model_executor -m '(not slow_test)'
  - pytest -v -s entrypoints/openai/completion/test_tensorizer_entrypoint.py


- label: Benchmarks # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  working_dir: "/vllm-workspace/.buildkite"
  source_file_dependencies:
  - benchmarks/
  - vllm/platforms/rocm.py
  commands:
  - bash scripts/run-benchmarks.sh


- label: Benchmarks CLI Test # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/benchmarks/
  commands:
  - pytest -v -s benchmarks/


- label: OpenAI API correctness # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - csrc/
  - vllm/entrypoints/openai/
  - vllm/model_executor/models/whisper.py
  - vllm/model_executor/layers/
  - vllm/v1/attention/backends/
  - vllm/v1/attention/selector.py
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  - vllm/model_executor/model_loader/
  commands:
  - bash ../tools/install_torchcodec_rocm.sh || exit 1
  - pytest -s entrypoints/openai/correctness/


- label: Basic Models Tests (Initialization) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  torch_nightly: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/test_initialization.py
  - tests/models/registry.py
  commands:
  - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset


- label: Basic Models Tests (Extra Initialization) %N # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  torch_nightly: true
  parallelism: 2
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/model_executor/models/
  - vllm/model_executor/layers/
  - tests/models/test_initialization.py
  - tests/models/registry.py
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s models/test_initialization.py -k 'not test_can_initialize_small_subset' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB


- label: Basic Models Tests (Other) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  torch_nightly: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/test_terratorch.py
  - tests/models/test_transformers.py
  - tests/models/test_registry.py
  commands:
  - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py


- label: Basic Models Test (Other CPU) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  no_gpu: true
  optional: true
  torch_nightly: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/test_utils.py
  - tests/models/test_vision.py
  commands:
  - pytest -v -s models/test_utils.py models/test_vision.py


- label: Language Models Tests (Extra Standard) %N # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  torch_nightly: true
  parallelism: 2
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/model_executor/models/
  - vllm/model_executor/model_loader/
  - vllm/model_executor/layers/
  - vllm/v1/attention/backends/
  - vllm/v1/attention/selector.py
  - tests/models/language/pooling/test_embedding.py
  - tests/models/language/generation/test_common.py
  - tests/models/language/pooling/test_classification.py
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - pip freeze | grep -E 'torch'
  - export TORCH_NCCL_BLOCKING_WAIT=1
  - pytest -v -s models/language -m 'core_model and slow_test' --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB


- label: Language Models Test (PPL) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/language/generation_ppl_test
  commands:
  - pytest -v -s models/language/generation_ppl_test


- label: Language Models Test (Extended Pooling)  # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/language/pooling
  commands:
  - pytest -v -s models/language/pooling -m 'not core_model'


- label: Language Models Test (MTEB) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/language/pooling_mteb_test
  commands:
  - pytest -v -s models/language/pooling_mteb_test


- label: Multi-Modal Processor (CPU) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  no_gpu: true
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  - tests/models/registry.py
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py


- label: Multi-Modal Accuracy Eval (Small Models) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  optional: true
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - vllm/multimodal/
  - vllm/inputs/
  - vllm/v1/core/
  - vllm/platforms/rocm.py
  - vllm/model_executor/model_loader/
  commands:
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1


- label: "Multi-Modal Models (Standard) 1: qwen2" # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  torch_nightly: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen2"
  - pytest -v -s models/multimodal/generation/test_ultravox.py -m core_model


- label: "Multi-Modal Models (Standard) 2: qwen3 + gemma" # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  torch_nightly: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen3 or gemma"
  - pytest -v -s models/multimodal/generation/test_qwen2_5_vl.py -m core_model


- label: "Multi-Modal Models (Standard) 3: llava + qwen2_vl" # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  torch_nightly: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "not qwen2 and not qwen3 and not gemma"
  - pytest -v -s models/multimodal/generation/test_qwen2_vl.py -m core_model


- label: "Multi-Modal Models (Standard) 4: other + whisper" # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  torch_nightly: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal/generation
  - tests/models/multimodal/test_mapping.py
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/generation/test_ultravox.py --ignore models/multimodal/generation/test_qwen2_5_vl.py --ignore models/multimodal/generation/test_qwen2_vl.py --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
  - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model


- label: Multi-Modal Models (Extended Generation 1) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal/generation
  - tests/models/multimodal/test_mapping.py
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/generation -m 'not core_model' --ignore models/multimodal/generation/test_common.py
  - pytest -v -s models/multimodal/test_mapping.py


- label: Multi-Modal Models (Extended Generation 2) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal/generation
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'


- label: Multi-Modal Models (Extended Generation 3) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal/generation
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'


- label: Multi-Modal Models (Extended Pooling) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal/pooling
  commands:
  - pytest -v -s models/multimodal/pooling -m 'not core_model'


- label: Distributed Comm Ops # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  num_gpus: 2
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/distributed
  - tests/distributed
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s distributed/test_comm_ops.py
  - pytest -v -s distributed/test_shm_broadcast.py
  - pytest -v -s distributed/test_shm_buffer.py
  - pytest -v -s distributed/test_shm_storage.py


- label: Distributed DP Tests (2 GPUs) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_2
  num_gpus: 2
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/worker/worker_base.py
  - vllm/v1/engine/
  - vllm/v1/worker/
  - tests/v1/distributed
  - tests/entrypoints/openai/test_multi_api_servers.py
  - vllm/platforms/rocm.py
  commands:
  - export TORCH_NCCL_BLOCKING_WAIT=1
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - DP_SIZE=2 pytest -v -s entrypoints/openai/test_multi_api_servers.py


- label: Distributed Compile + RPC Tests (2 GPUs) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_2
  num_gpus: 2
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/compilation/
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/worker/worker_base.py
  - vllm/v1/engine/
  - vllm/v1/worker/
  - tests/compile/fullgraph/test_basic_correctness.py
  - tests/compile/test_wrapper.py
  - tests/entrypoints/llm/test_collective_rpc.py
  - vllm/platforms/rocm.py
  commands:
  - export TORCH_NCCL_BLOCKING_WAIT=1
  - pytest -v -s entrypoints/llm/test_collective_rpc.py
  - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py


- label: Distributed Torchrun + Shutdown Tests (2 GPUs) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_2
  num_gpus: 2
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/worker/worker_base.py
  - vllm/v1/engine/
  - vllm/v1/worker/
  - tests/distributed/
  - tests/v1/shutdown
  - tests/v1/worker/test_worker_memory_snapshot.py
  - vllm/platforms/rocm.py
  commands:
  - export TORCH_NCCL_BLOCKING_WAIT=1
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
  - pytest -v -s v1/worker/test_worker_memory_snapshot.py


- label: Distributed Model Tests (2 GPUs) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_2
  num_gpus: 2
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/model_executor/model_loader/sharded_state_loader.py
  - vllm/model_executor/models/
  - vllm/model_executor/layers/
  - vllm/v1/attention/backends/
  - vllm/v1/attention/selector.py
  - tests/basic_correctness/
  - tests/model_executor/model_loader/test_sharded_state_loader.py
  - tests/models/
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py -m '(not slow_test)'
  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/language -v -s -m 'distributed(num_gpus=2)'
  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
  - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'


- label: Plugin Tests (2 GPUs) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_2
  num_gpus: 2
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/plugins/
  - tests/plugins/
  - vllm/platforms/rocm.py
  commands:
  # BEGIN: platform plugin and general plugin tests, all the code in-between runs on dummy platform
  - pip install -e ./plugins/vllm_add_dummy_platform
  - pytest -v -s plugins_tests/test_platform_plugins.py
  - pip uninstall vllm_add_dummy_platform -y
  # END: platform plugin tests
  # BEGIN: `io_processor` plugins test, all the code in between uses the `prithvi_io_processor` plugin
  - pip install -e ./plugins/prithvi_io_processor_plugin
  - pytest -v -s plugins_tests/test_io_processor_plugins.py
  - pip uninstall prithvi_io_processor_plugin -y
  # END: `io_processor` plugins test
  # BEGIN: `bge_m3_sparse io_processor` test
  - pip install -e ./plugins/bge_m3_sparse_plugin
  - pytest -v -s plugins_tests/test_bge_m3_sparse_io_processor_plugins.py
  - pip uninstall bge_m3_sparse_plugin -y
  # END: `bge_m3_sparse io_processor` test
  # BEGIN: `stat_logger` plugins test
  - pip install -e ./plugins/vllm_add_dummy_stat_logger
  - pytest -v -s plugins_tests/test_stats_logger_plugins.py
  - pip uninstall dummy_stat_logger -y
  # END: `stat_logger` plugins test
  # BEGIN: other tests
  - pytest -v -s plugins_tests/test_scheduler_plugins.py
  - pip install -e ./plugins/vllm_add_dummy_model
  - pytest -v -s distributed/test_distributed_oot.py
  - pytest -v -s entrypoints/openai/chat_completion/test_oot_registration.py
  - pytest -v -s models/test_oot_registration.py
  - pytest -v -s plugins/lora_resolvers
  # END: other tests


- label: Pipeline + Context Parallelism (4 GPUs) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_4
  num_gpus: 4
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/model_executor/models/
  - vllm/model_executor/layers/
  - vllm/v1/attention/backends/
  - vllm/v1/attention/selector.py
  - tests/distributed/
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s distributed/test_pp_cudagraph.py
  - pytest -v -s distributed/test_pipeline_parallel.py


- label: Ray Dependency Compatibility Check # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_1
  working_dir: "/"
  source_file_dependencies:
  - requirements/
  - setup.py
  - vllm/platforms/rocm.py
  commands:
  - bash /vllm-workspace/.buildkite/scripts/check-ray-compatibility.sh


- label: Distributed NixlConnector PD accuracy (4 GPUs)  # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_4
  num_gpus: 4
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
  - tests/v1/kv_connector/nixl_integration/
  - vllm/platforms/rocm.py
  commands:
  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
  - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh


- label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_4
  num_gpus: 4
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
  - tests/v1/kv_connector/nixl_integration/
  - vllm/platforms/rocm.py
  commands:
  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
  - DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh


- label: NixlConnector PD + Spec Decode acceptance (2 GPUs) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_2
  num_gpus: 2
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
  - vllm/v1/worker/kv_connector_model_runner_mixin.py
  - tests/v1/kv_connector/nixl_integration/
  - vllm/platforms/rocm.py
  commands:
  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
  - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh


- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_4
  num_gpus: 4
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
  - tests/v1/kv_connector/nixl_integration/
  - vllm/platforms/rocm.py
  commands:
  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
  - CROSS_LAYERS_BLOCKS=True ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh


- label: Hyrbid SSM NixlConnector PD accuracy tests (4 GPUs) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi250_4
  num_gpus: 4
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
  - tests/v1/kv_connector/nixl_integration/
  - vllm/platforms/rocm.py
  commands:
  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
  - HYBRID_SSM=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh


- label: Distributed Tests (2 GPUs)(H100-MI250) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx90anightly, amdmi250]
  agent_pool: mi325_2
  num_gpus: 2
  working_dir: "/vllm-workspace/"
  source_file_dependencies:
  - vllm/distributed/
  - vllm/v1/distributed/
  - vllm/model_executor/layers/fused_moe/
  - vllm/v1/attention/backends/
  - vllm/v1/attention/selector.py
  - tests/distributed/test_context_parallel.py
  - examples/offline_inference/data_parallel.py
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - export TORCH_NCCL_BLOCKING_WAIT=1
  - pytest -v -s tests/distributed/test_context_parallel.py
  - VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization


#####################################################################################################################################
#                                                                                                                                   #
#                                                             gfx942                                                                #
#                                                                                                                                   #
#####################################################################################################################################


- label: Entrypoints Integration (LLM) # 13.1m
  timeout_in_minutes: 22
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  optional: true
  fast_check: true
  torch_nightly: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/llm
  - tests/entrypoints/offline_mode
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
  - pytest -v -s entrypoints/llm/test_generate.py
  - pytest -v -s entrypoints/offline_mode


- label: Entrypoints Integration (API Server openai - Part 1) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  fast_check: true
  torch_nightly: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/openai
  - tests/entrypoints/test_chat_utils
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py


- label: Entrypoints Integration (API Server openai - Part 2) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  fast_check: true
  torch_nightly: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/openai
  - tests/entrypoints/test_chat_utils
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/openai/completion --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py
  - pytest -v -s entrypoints/openai/speech_to_text/
  - pytest -v -s entrypoints/test_chat_utils.py


- label: Entrypoints Integration (API Server openai - Part 3) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  fast_check: true
  torch_nightly: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/openai
  - tests/entrypoints/test_chat_utils
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion --ignore=entrypoints/openai/completion --ignore=entrypoints/openai/speech_to_text/ --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses


- label: Entrypoints Integration (API Server 2) #26.9m
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  optional: true
  fast_check: true
  torch_nightly: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/rpc
  - tests/entrypoints/serve/instrumentator
  - tests/tool_use
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/serve/instrumentator
  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
  - pytest -v -s tool_use


- label: Entrypoints Integration (Pooling) # 22.8m
  timeout_in_minutes: 48
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  fast_check: true
  torch_nightly: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/pooling
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/pooling


- label: Distributed Torchrun + Examples (4 GPUs) # TBD
  timeout_in_minutes: 80
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_4
  num_gpus: 4
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/distributed/
  - tests/distributed/test_torchrun_example.py
  - tests/distributed/test_torchrun_example_moe.py
  - examples/rl/
  - tests/examples/offline_inference/data_parallel.py
  - vllm/platforms/rocm.py
  commands:
  - export TORCH_NCCL_BLOCKING_WAIT=1
  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
  # rlhf examples
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 ../examples/rl/rlhf_nccl.py
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 ../examples/rl/rlhf_ipc.py


- label: Distributed DP Tests (4 GPUs) # TBD
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_4
  num_gpus: 4
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/distributed/
  - tests/v1/distributed
  - tests/v1/engine/test_engine_core_client.py
  - tests/distributed/test_utils
  - vllm/platforms/rocm.py
  commands:
  - export TORCH_NCCL_BLOCKING_WAIT=1
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
  - pytest -v -s distributed/test_utils.py


- label: Distributed Compile + Comm (4 GPUs) # TBD
  timeout_in_minutes: 40
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_4
  num_gpus: 4
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/distributed/
  - tests/distributed/test_pynccl
  - tests/distributed/test_events
  - tests/compile/fullgraph/test_basic_correctness.py
  - tests/distributed/test_symm_mem_allreduce.py
  - tests/distributed/test_multiproc_executor.py
  - vllm/platforms/rocm.py
  commands:
  - export TORCH_NCCL_BLOCKING_WAIT=1
  - pytest -v -s compile/fullgraph/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s distributed/test_events.py
  - pytest -v -s distributed/test_symm_mem_allreduce.py
  - pytest -v -s distributed/test_multiproc_executor.py::test_multiproc_executor_multi_node


- label: Distributed Tests (8 GPUs)(H100-MI325) # 6.4m
  timeout_in_minutes: 10
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_8
  num_gpus: 8
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - examples/offline_inference/torchrun_dp_example.py
  - vllm/config/parallel.py
  - vllm/distributed/
  - vllm/v1/engine/llm_engine.py
  - vllm/v1/executor/uniproc_executor.py
  - vllm/v1/worker/gpu_worker.py
  - vllm/platforms/rocm.py
  commands:
  - export TORCH_NCCL_BLOCKING_WAIT=1
  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep


- label: Elastic EP Scaling Test # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_4
  num_gpus: 4
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/compilation/
  - tests/distributed/
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s distributed/test_elastic_ep.py


- label: Engine # 11.3m
  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/engine
  - tests/test_sequence
  - tests/test_config
  - tests/test_logger
  - tests/test_vllm_port
  commands:
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py


- label: Engine (1 GPU) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/v1/engine/
  - tests/v1/engine/
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s v1/engine/test_preprocess_error_handling.py
  - pytest -v -s v1/engine --ignore v1/engine/test_preprocess_error_handling.py


- label: e2e Scheduling (1 GPU) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/v1/
  - tests/v1/e2e/general/
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s v1/e2e/general/test_async_scheduling.py


- label: e2e Core (1 GPU) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/v1/
  - tests/v1/e2e/
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s v1/e2e/general --ignore v1/e2e/general/test_async_scheduling.py


- label: Spec Decode Eagle # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/v1/spec_decode/
  - vllm/v1/worker/gpu/spec_decode/
  - vllm/model_executor/model_loader/
  - vllm/v1/sample/
  - vllm/model_executor/layers/
  - tests/v1/e2e/spec_decode/
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s v1/e2e/spec_decode -k "eagle_correctness"


- label: Spec Decode Speculators + MTP # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/v1/spec_decode/
  - vllm/v1/worker/gpu/spec_decode/
  - vllm/model_executor/model_loader/
  - vllm/v1/sample/
  - vllm/model_executor/layers/
  - vllm/transformers_utils/configs/speculators/
  - tests/v1/e2e/spec_decode/
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s v1/e2e/spec_decode -k "speculators or mtp_correctness"


- label: Spec Decode Ngram + Suffix # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/v1/spec_decode/
  - vllm/v1/worker/gpu/spec_decode/
  - vllm/model_executor/model_loader/
  - vllm/v1/sample/
  - vllm/model_executor/layers/
  - tests/v1/e2e/spec_decode/
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s v1/e2e/spec_decode -k "ngram or suffix"


- label: Spec Decode Draft Model # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/v1/spec_decode/
  - vllm/v1/worker/gpu/spec_decode/
  - vllm/model_executor/model_loader/
  - vllm/v1/sample/
  - vllm/model_executor/layers/
  - tests/v1/e2e/spec_decode/
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s v1/e2e/spec_decode -k "draft_model or no_sync or batch_inference"


- label: V1 e2e (2 GPUs) # 7.1m
  timeout_in_minutes: 12
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_2
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/v1/e2e
  commands:
    - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "tensor_parallelism"


- label: V1 e2e (4 GPUs) # 52.6m
  timeout_in_minutes: 106
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_4
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/v1/e2e
  commands:
    - pytest -v -s v1/e2e/spec_decode/test_spec_decode.py -k "eagle_correctness_heavy"


- label: V1 e2e (4xH100-4xMI325) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_4
  optional: true
  source_file_dependencies:
    - vllm/v1/attention/backends/utils.py
    - vllm/v1/worker/gpu_model_runner.py
    - tests/v1/e2e/test_hybrid_chunked_prefill.py
  commands:
    - pytest -v -s v1/e2e/test_hybrid_chunked_prefill.py


- label: V1 Spec Decode # TBD
  timeout_in_minutes: 40
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/v1/spec_decode
  commands:
  - pytest -v -s -m 'not slow_test' v1/spec_decode


- label: V1 Sample + Logits # TBD
  timeout_in_minutes: 40
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/v1/sample
  - tests/v1/logits_processors
  - tests/v1/test_oracle.py
  - tests/v1/test_request.py
  - tests/v1/test_outputs.py
  commands:
  - pytest -v -s v1/sample
  - pytest -v -s v1/logits_processors
  - pytest -v -s v1/test_oracle.py
  - pytest -v -s v1/test_request.py
  - pytest -v -s v1/test_outputs.py


- label: V1 Core + KV + Metrics # TBD
  timeout_in_minutes: 40
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/v1/core
  - tests/v1/executor
  - tests/v1/kv_offload
  - tests/v1/worker
  - tests/v1/kv_connector/unit
  - tests/v1/metrics
  - tests/entrypoints/openai/correctness/test_lmeval.py
  commands:
  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
  - pytest -v -s -m 'not cpu_test' v1/core
  - pytest -v -s v1/executor
  - pytest -v -s v1/kv_offload
  - pytest -v -s v1/worker
  - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
  - pytest -v -s -m 'not cpu_test' v1/metrics
  - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
  # - export HSA_NO_SCRATCH_RECLAIM=1
  - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine


- label: V1 Speculative Decoding (slow) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/v1/spec_decode/
  - vllm/model_executor/models/
  - vllm/v1/attention/
  - vllm/model_executor/layers/
  - tests/v1/spec_decode/
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s -m 'slow_test' v1/spec_decode/test_eagle.py
  - pytest -v -s -m 'slow_test' v1/spec_decode/test_extract_hidden_states.py
  - pytest -v -s -m 'slow_test' v1/spec_decode/test_max_len.py
  - pytest -v -s -m 'slow_test' v1/spec_decode/test_mtp.py
  - pytest -v -s -m 'slow_test' v1/spec_decode/test_ngram.py
  - pytest -v -s -m 'slow_test' v1/spec_decode/test_speculators_eagle3.py
  - pytest -v -s -m 'slow_test' v1/spec_decode/test_tree_attention.py


- label: Acceptance Length Test (Large Models) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/v1/spec_decode/
  - vllm/model_executor/models/mlp_speculator.py
  - tests/v1/spec_decode/test_acceptance_length.py
  - vllm/platforms/rocm.py
  commands:
  - export VLLM_ALLOW_INSECURE_SERIALIZATION=1
  - pytest -v -s v1/spec_decode/test_acceptance_length.py -m slow_test


- label: V1 attention (H100-MI325) # 14.5m
  timeout_in_minutes: 40
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/config/attention.py
  - vllm/model_executor/layers/attention
  - vllm/v1/attention
  - tests/v1/attention
  - vllm/_aiter_ops.py
  - vllm/envs.py
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s v1/attention


- label: Batch Invariance (H100-MI325) # 5.2m
  timeout_in_minutes: 12
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/v1/attention
  - vllm/model_executor/layers
  - tests/v1/determinism/
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pip install pytest-timeout pytest-forked
  - pytest -v -s v1/determinism/test_batch_invariance.py
  - pytest -v -s v1/determinism/test_rms_norm_batch_invariant.py


- label: V1 others (CPU) # 10.4m
  timeout_in_minutes: 28
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  no_gpu: true
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/v1
  commands:
  - pytest -v -s -m 'cpu_test' v1/core
  - pytest -v -s v1/structured_output
  - pytest -v -s v1/test_serial_utils.py
  - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
  - pytest -v -s -m 'cpu_test' v1/metrics


- label: Examples # 24.5m
  timeout_in_minutes: 55
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  optional: true
  working_dir: "/vllm-workspace/examples"
  source_file_dependencies:
  - vllm/entrypoints
  - vllm/multimodal
  - examples/
  - vllm/platforms/rocm.py
  commands:
    - pip install tensorizer
    # Basic
    - python3 basic/offline_inference/chat.py --attention-backend TRITON_ATTN
    - python3 basic/offline_inference/generate.py --model facebook/opt-125m
    - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
    - python3 basic/offline_inference/classify.py
    - python3 basic/offline_inference/embed.py
    - python3 basic/offline_inference/score.py
    # Multi-modal models
    - python3 offline_inference/audio_language.py --seed 0
    - python3 offline_inference/vision_language.py --seed 0
    - python3 offline_inference/vision_language_multi_image.py --seed 0
    - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
    # Pooling models
    - python3 pooling/embed/vision_embedding_offline.py --seed 0
    # Features demo
    - python3 offline_inference/prefix_caching.py
    - python3 offline_inference/llm_engine_example.py
    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536


- label: Platform Tests (CUDA) # 5.0m
  timeout_in_minutes: 9
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/cuda
  commands:
  - pytest -v -s cuda/test_cuda_context.py
  - pytest -v -s cuda/test_platform_no_cuda_init.py


- label: PyTorch Compilation Passes Unit Tests # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/compile/passes
  commands:
  - pytest -s -v compile/passes --ignore compile/passes/distributed


- label: Kernels Core Operation Test # 26.8m
  timeout_in_minutes: 38
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - csrc/
  - tests/kernels/core
  - tests/kernels/test_top_k_per_row.py
  - tests/kernels/test_concat_mla_q.py
  - vllm/model_executor/layers/rotary_embedding/
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s kernels/core kernels/test_top_k_per_row.py


- label: Kernels Attention Test %N # 17.7m
  timeout_in_minutes: 28
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  parallelism: 2
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - csrc/attention/
  - vllm/v1/attention
  - vllm/model_executor/layers/attention
  - tests/kernels/attention
  - vllm/_aiter_ops.py
  - vllm/envs.py
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT


- label: Kernels Quantization Test %N # 15.2m
  timeout_in_minutes: 24
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  parallelism: 2
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - csrc/quantization/
  - vllm/model_executor/layers/quantization
  - tests/kernels/quantization
  - tests/kernels/quantization/test_rocm_skinny_gemms.py
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  - vllm/model_executor/kernels/
  commands:
  - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT


- label: Kernels MoE Test %N # TBD
  timeout_in_minutes: 19
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  parallelism: 4
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - csrc/quantization/cutlass_w8a8/moe/
  - csrc/moe/
  - tests/kernels/moe
  - vllm/model_executor/layers/fused_moe/
  - vllm/distributed/device_communicators/
  - vllm/envs.py
  - vllm/config
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s kernels/moe --ignore=kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  - pytest -v -s kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT


- label: Kernels FP8 MoE Test # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_2
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - csrc/moe/
  - csrc/quantization/w8a8/cutlass/moe/
  - vllm/model_executor/layers/fused_moe/
  - tests/kernels/moe/test_deepep_moe.py
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  - vllm/envs.py
  commands:
    - pytest -v -s kernels/moe/test_deepep_moe.py


- label: ROCm AITER Ops Test # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/_aiter_ops.py
  - vllm/envs.py
  - vllm/platforms/rocm.py
  - tests/rocm/aiter/
  - vllm/v1/attention/backends/mla/rocm_aiter_mla.py
  - vllm/v1/attention/selector.py
  commands:
  - pytest -v -s rocm/aiter/


- label: Benchmarks # 8.2m
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  optional: true
  working_dir: "/vllm-workspace/.buildkite"
  source_file_dependencies:
  - benchmarks/
  - vllm/platforms/rocm.py
  commands:
  - bash scripts/run-benchmarks.sh


- label: Quantization # 36.1m
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  - tests/quantization
  commands:

  # temporary install here since we need nightly, will move to requirements/test.in
  # after torchao 0.12 release, and pin a working version of torchao nightly here

  # since torchao nightly is only compatible with torch nightly currently
  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
  # we can only upgrade after this is resolved
  # TODO(jerryzh168): resolve the above comment
  - uv pip install --system torchao==0.17.0
  - uv pip install --system conch-triton-kernels
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py


- label: Language Models Tests (Standard) # 22.8m
  timeout_in_minutes: 38
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  optional: true
  torch_nightly: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/language
  commands:
  - pip freeze | grep -E 'torch'
  - pytest -v -s models/language -m 'core_model and (not slow_test)'


- label: Language Models Tests (Hybrid) %N # 34.9m
  timeout_in_minutes: 55
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  torch_nightly: true
  parallelism: 2
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/language/generation
  commands:
  - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr'
  - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
  - pytest -v -s models/language/generation -m hybrid_model --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --shard-id=$$BUILDKITE_PARALLEL_JOB


- label: Language Models Test (Extended Generation) # 32.2m
  timeout_in_minutes: 55
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/language/generation
  commands:
  - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr'
  - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
  - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'


- label: Multi-Modal Processor # 1h 42m
  timeout_in_minutes: 138
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  - tests/models/registry.py
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/processing/test_tensor_schema.py


- label: "Multi-Modal Models (Standard) 1: qwen2" # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  torch_nightly: true
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen2"
  - pytest -v -s models/multimodal/generation/test_ultravox.py -m core_model


- label: "Multi-Modal Models (Standard) 2: qwen3 + gemma" # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  torch_nightly: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen3 or gemma"
  - pytest -v -s models/multimodal/generation/test_qwen2_5_vl.py -m core_model


- label: "Multi-Modal Models (Standard) 3: llava + qwen2_vl" # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  torch_nightly: true
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal/generation
  - tests/models/multimodal/test_mapping.py
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "not qwen2 and not qwen3 and not gemma"
  - pytest -v -s models/multimodal/generation/test_qwen2_vl.py -m core_model


- label: "Multi-Modal Models (Standard) 4: other + whisper" # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  torch_nightly: true
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal/generation
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/generation/test_ultravox.py --ignore models/multimodal/generation/test_qwen2_5_vl.py --ignore models/multimodal/generation/test_qwen2_vl.py --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
  - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model


- label: Multi-Modal Models (Extended Generation 1) # 1h 2m
  timeout_in_minutes: 106
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal/generation
  - tests/models/multimodal/test_mapping.py
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/generation -m 'not core_model' --ignore models/multimodal/generation/test_common.py
  - pytest -v -s models/multimodal/test_mapping.py


- label: Multi-Modal Models (Extended Generation 2) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal/generation
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'


- label: Multi-Modal Models (Extended Generation 3) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal/generation
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'


- label: Multi-Modal Models (Extended Pooling) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal/pooling
  commands:
  - pytest -v -s models/multimodal/pooling -m 'not core_model'


- label: Quantized Models Test # 21.4m
  timeout_in_minutes: 38
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/model_executor/layers/quantization
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  - tests/models/quantization
  - vllm/model_executor/model_loader/
  commands:
  - pytest -v -s models/quantization


- label: Transformers Nightly Models # 50.9m
  timeout_in_minutes: 102
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  optional: true
  working_dir: "/vllm-workspace/"
  source_file_dependencies:
  - vllm/model_executor/models/
  - vllm/model_executor/model_loader/
  - vllm/multimodal/
  - vllm/model_executor/layers/
  - vllm/v1/attention/backends/
  - vllm/v1/attention/selector.py
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  - tests/models/
  - examples/
  commands:
  - pip install --upgrade git+https://github.com/huggingface/transformers
  - pytest -v -s tests/models/test_initialization.py
  - pytest -v -s tests/models/test_transformers.py
  - pytest -v -s tests/models/multimodal/processing/
  - pytest -v -s tests/models/multimodal/test_mapping.py
  - python3 examples/basic/offline_inference/chat.py
  - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
  - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper


- label: Quantized MoE Test (B200-MI325) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  working_dir: "/vllm-workspace/"
  source_file_dependencies:
  - tests/quantization/test_gfx3xx_moe.py
  - vllm/model_executor/models/deepseek_v2.py
  - vllm/model_executor/models/gpt_oss.py
  - vllm/model_executor/models/llama4.py
  - vllm/model_executor/layers/fused_moe
  - vllm/model_executor/layers/quantization/compressed_tensors
  - vllm/model_executor/layers/quantization/modelopt.py
  - vllm/model_executor/layers/quantization/mxfp4.py
  - vllm/v1/attention/backends/triton_attn.py
  - vllm/v1/attention/backends/rocm_attn.py
  - vllm/v1/attention/backends/rocm_aiter_fa.py
  - vllm/v1/attention/backends/mla/
  - vllm/v1/attention/selector.py
  - vllm/model_executor/layers/layernorm.py
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  - vllm/model_executor/model_loader/
  commands:
  - pytest -s -v tests/quantization/test_gfx3xx_moe.py


- label: Distributed DP Tests (2 GPUs) # 56.1m
  timeout_in_minutes: 102
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_2
  num_gpus: 2
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/worker/worker_base.py
  - vllm/v1/engine/
  - vllm/v1/worker/
  - tests/v1/distributed
  - tests/entrypoints/openai/test_multi_api_servers.py
  - vllm/platforms/rocm.py
  commands:
  - export TORCH_NCCL_BLOCKING_WAIT=1
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - DP_SIZE=2 pytest -v -s entrypoints/openai/test_multi_api_servers.py


- label: Distributed Compile + RPC Tests (2 GPUs) # 56.1m
  timeout_in_minutes: 102
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_2
  num_gpus: 2
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/compilation/
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/worker/worker_base.py
  - vllm/v1/engine/
  - vllm/v1/worker/
  - tests/compile/fullgraph/test_basic_correctness.py
  - tests/compile/test_wrapper.py
  - tests/entrypoints/llm/test_collective_rpc.py
  - vllm/platforms/rocm.py
  commands:
  - export TORCH_NCCL_BLOCKING_WAIT=1
  - pytest -v -s entrypoints/llm/test_collective_rpc.py
  - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py


- label: Distributed Torchrun + Shutdown Tests (2 GPUs) # 56.1m
  timeout_in_minutes: 102
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_2
  num_gpus: 2
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/worker/worker_base.py
  - vllm/v1/engine/
  - vllm/v1/worker/
  - tests/distributed/
  - tests/v1/shutdown
  - tests/v1/worker/test_worker_memory_snapshot.py
  - vllm/platforms/rocm.py
  commands:
  - export TORCH_NCCL_BLOCKING_WAIT=1
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
  - pytest -v -s v1/worker/test_worker_memory_snapshot.py


- label: Distributed Model Tests (2 GPUs) # 19.3m
  timeout_in_minutes: 38
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_2
  num_gpus: 2
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/model_executor/model_loader/sharded_state_loader.py
  - vllm/model_executor/models/
  - vllm/model_executor/layers/
  - vllm/v1/attention/backends/
  - vllm/v1/attention/selector.py
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  - tests/basic_correctness/
  - tests/model_executor/model_loader/test_sharded_state_loader.py
  - tests/models/
  commands:
  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py -m '(not slow_test)'
  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/language -v -s -m 'distributed(num_gpus=2)'
  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
  - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'


- label: LoRA TP (Distributed) # 9.8m
  timeout_in_minutes: 18
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_4
  num_gpus: 4
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/lora
  - tests/lora
  - vllm/platforms/rocm.py
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
  - pytest -v -s -x lora/test_chatglm3_tp.py
  - pytest -v -s -x lora/test_llama_tp.py
  - pytest -v -s -x lora/test_llm_with_multi_loras.py
  - pytest -v -s -x lora/test_olmoe_tp.py
  - pytest -v -s -x lora/test_gptoss_tp.py
  - pytest -v -s -x lora/test_qwen35_densemodel_lora.py


- label: Weight Loading Multiple GPU # 7.5m
  timeout_in_minutes: 14
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_2
  num_gpus: 2
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/weight_loading
  commands:
  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt


- label: Weight Loading Multiple GPU - Large Models # 12.6m
  timeout_in_minutes: 26
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_2
  num_gpus: 2
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/weight_loading
  commands:
  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt


- label: Ray Dependency Compatibility Check # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  optional: true
  working_dir: "/"
  source_file_dependencies:
  - requirements/
  - setup.py
  - vllm/platforms/rocm.py
  commands:
  - bash /vllm-workspace/.buildkite/scripts/check-ray-compatibility.sh


- label: Distributed NixlConnector PD accuracy (4 GPUs)  # 27.4m
  timeout_in_minutes: 44
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_4
  num_gpus: 4
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
  - tests/v1/kv_connector/nixl_integration/
  - vllm/platforms/rocm.py
  commands:
  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
  - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh


- label: CrossLayer KV layout Distributed NixlConnector PD accuracy tests (4 GPUs) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_4
  num_gpus: 4
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
  - tests/v1/kv_connector/nixl_integration/
  - vllm/platforms/rocm.py
  commands:
  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
  - CROSS_LAYERS_BLOCKS=True ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh


- label: Distributed Tests (4 GPUs)(A100-MI325) # 20.9m
  timeout_in_minutes: 37
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_4
  num_gpus: 4
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  commands:
  - export TORCH_NCCL_BLOCKING_WAIT=1
  - pytest -v -s distributed/test_custom_all_reduce.py
  - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - pytest -v -s -x lora/test_mixtral.py


- label: Distributed Tests (2 GPUs)(H100-MI325) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_2
  num_gpus: 2
  working_dir: "/vllm-workspace/"
  source_file_dependencies:
  - vllm/distributed/
  - vllm/v1/distributed/
  - vllm/model_executor/layers/fused_moe/
  - tests/v1/distributed/test_dbo.py
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - export TORCH_NCCL_BLOCKING_WAIT=1
  - pytest -v -s tests/v1/distributed/test_dbo.py


- label: Distributed Compile Unit Tests (2xH100-2xMI325) # 14.3m
  timeout_in_minutes: 32
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_2
  num_gpus: 2
  working_dir: "/vllm-workspace/"
  source_file_dependencies:
  - vllm/compilation/
  - vllm/model_executor/layers
  - tests/compile/passes/distributed/
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py
  - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py
  # TODO: this test is not supported on ROCm, there are aiter kernels for this.
  # - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
  # - pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
  # - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"


- label: LM Eval Small Models # 13.3m
  timeout_in_minutes: 23
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - vllm/model_executor/models/
  - vllm/model_executor/model_loader/
  - vllm/v1/attention/backends/
  - vllm/v1/attention/selector.py
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt


- label: LM Eval Small Models (MI325) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - vllm/model_executor/models/
  - vllm/model_executor/model_loader/
  - vllm/v1/attention/backends/
  - vllm/v1/attention/selector.py
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small-rocm.txt


- label: LM Eval Small Models (B200-MI325) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_2
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - vllm/model_executor/models/
  - vllm/model_executor/model_loader/
  - vllm/v1/attention/backends/
  - vllm/v1/attention/selector.py
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi3xx-fp8-and-mixed.txt


- label: LM Eval Large Models (H200-MI325) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_8
  optional: true
  num_gpus: 8
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/model_executor/models/
  - vllm/model_executor/model_loader/
  - vllm/model_executor/layers/quantization/
  - vllm/v1/attention/backends/
  - vllm/v1/attention/selector.py
  - vllm/model_executor/layers/layernorm.py
  - csrc/
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  - tests/evals/
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi3xx.txt


- label: LM Eval Large Models (4 GPUs)(FP8) # 24.8m
  timeout_in_minutes: 42
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_4
  num_gpus: 4
  optional: true
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - vllm/model_executor/models/
  - vllm/model_executor/model_loader/
  - vllm/v1/attention/backends/
  - vllm/v1/attention/selector.py
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - export VLLM_USE_DEEP_GEMM=0
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm-fp8.txt --tp-size=4


- label: LM Eval Large Models (4 GPUs)(A100-MI325) # 17.3m
  timeout_in_minutes: 27
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_4
  num_gpus: 4
  optional: true
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - vllm/model_executor/models/
  - vllm/model_executor/model_loader/
  - vllm/v1/attention/backends/
  - vllm/v1/attention/selector.py
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4


- label: ROCm LM Eval Large Models (8 Card) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_8
  optional: true
  num_gpus: 8
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - vllm/model_executor/models/
  - vllm/model_executor/model_loader/
  - vllm/model_executor/layers/quantization/
  - vllm/v1/attention/backends/
  - vllm/v1/attention/selector.py
  - vllm/model_executor/layers/layernorm.py
  - csrc/
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm.txt --tp-size=8


- label: GPQA Eval (GPT-OSS) (H100-MI325) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_2
  num_gpus: 2
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - vllm/model_executor/models/
  - vllm/model_executor/model_loader/
  - vllm/v1/attention/backends/
  - vllm/v1/attention/selector.py
  - vllm/model_executor/layers/fused_moe/
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  - tests/evals/gpt_oss/
  commands:
    - uv pip install --system 'gpt-oss[eval]==0.0.5'
    - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-gfx942.txt


- label: DeepSeek V2-Lite Accuracy # 6.7m
  timeout_in_minutes: 12
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_4
  num_gpus: 4
  optional: true
  working_dir: "/vllm-workspace"
  source_file_dependencies:
  - vllm/model_executor/models/
  - vllm/model_executor/model_loader/
  - vllm/distributed/eplb
  - vllm/model_executor/layers/fused_moe/
  - vllm/model_executor/layers/quantization/
  - vllm/v1/attention/backends/
  - vllm/v1/attention/backends/mla/
  - vllm/v1/attention/selector.py
  - .buildkite/scripts/scheduled_integration_test/
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010


- label: DeepSeek V2-Lite Prefetch Offload Accuracy (H100-MI325) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  num_gpus: 1
  working_dir: "/vllm-workspace"
  source_file_dependencies:
  - vllm/model_executor/models/
  - vllm/model_executor/model_loader/
  - vllm/model_executor/layers/fused_moe/
  - vllm/model_executor/layers/quantization/
  - vllm/v1/attention/backends/
  - vllm/v1/attention/backends/mla/
  - vllm/v1/attention/selector.py
  - .buildkite/scripts/scheduled_integration_test/
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_prefetch_offload.sh 0.25 200 8030


- label: Qwen3-30B-A3B-FP8-block Accuracy # 6.4m
  timeout_in_minutes: 11
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_4
  optional: true
  working_dir: "/vllm-workspace"
  source_file_dependencies:
  - vllm/model_executor/models/
  - vllm/model_executor/model_loader/
  - vllm/model_executor/layers/quantization/
  - vllm/distributed/eplb
  - vllm/model_executor/layers/fused_moe/
  - vllm/v1/attention/backends/
  - vllm/v1/attention/selector.py
  - .buildkite/scripts/scheduled_integration_test/
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020


- label: Qwen3-Next-80B-A3B-Instruct MTP Async EPLB Accuracy # 10.9m
  timeout_in_minutes: 22
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_4
  num_gpus: 4
  optional: true
  working_dir: "/vllm-workspace"
  source_file_dependencies:
  - vllm/model_executor/models/
  - vllm/model_executor/model_loader/
  - vllm/v1/spec_decode/
  - vllm/distributed/eplb
  - vllm/model_executor/layers/fused_moe/
  - vllm/model_executor/layers/quantization/
  - vllm/v1/attention/backends/
  - vllm/v1/attention/selector.py
  - .buildkite/scripts/scheduled_integration_test/
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/qwen3_next_mtp_async_eplb.sh 0.8 1319 8040

##### .buildkite/test_areas/compile.yaml #####
# Slowly setting up the tests so that it is also easier for the
# CI team to review and upstream to the pipelinev2.
# The following tests are important for vLLM IR Ops refactoring,
# which affects fusion passes on ROCm. So we have to
# enable them as as soon as possible.

## TODO: Enable the test in this group
# # corresponds to .buildkite/test_areas/compile.yaml
# - label: Fusion and Compile Unit Tests (2xB200-2xMI325) # TBD
#   timeout_in_minutes: 180
#   mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325, tj]
#   agent_pool: mi325_1 # changed to 1 GPU until the fusion all reduce is enabled then only revert back to 2 GPUs
#   num_gpus: 1
#   working_dir: "/vllm-workspace/"
#   source_file_dependencies:
#   - csrc/quantization/fp4/
#   - vllm/model_executor/layers/quantization/
#   - vllm/model_executor/layers/layernorm.py
#   - vllm/model_executor/layers/activation.py
#   - vllm/model_executor/layers/attention/attention.py
#   - vllm/v1/attention/backends/flashinfer.py
#   - vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes
#   - tests/compile/test_fusion_attn.py
#   - tests/compile/test_silu_mul_quant_fusion.py
#   - tests/compile/distributed/test_fusion_all_reduce.py
#   - tests/compile/fullgraph/test_full_graph.py
#   commands:
#     - rocm-smi
#     # we run all backend tests on ROCm
#     # These two tests are covered in "PyTorch Compilation Passes Unit Tests"
#     # - "pytest -v -s tests/compile/passes/test_fusion_attn.py"
#     # - "pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py"
#     # TODO: this test is not supported on ROCm, there are aiter kernels for this.
#     # - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
#     # TODO: find out more details
#     # - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile


- label: Fusion E2E Quick (H100-MI325) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  num_gpus: 1
  working_dir: "/vllm-workspace/"
  source_file_dependencies:
  - csrc/quantization/
  - vllm/model_executor/
  - vllm/v1/attention/
  - vllm/compilation/
  - tests/compile/fusions_e2e/
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - rocm-smi
  # Run all models and attn backends but only Inductor partition and native custom ops
  - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and not +rms_norm and not +quant_fp8'"
  # Different from CUDA, Qwen requires +rms_norm and +quant_fp8 as rms+quant fusion is only supported on AITER
  - "pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k 'inductor_partition and +rms_norm and +quant_fp8 and qwen3'"


- label: Fusion E2E Config Sweep (H100-MI325) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx942nightly, amdmi325]
  agent_pool: mi325_1
  num_gpus: 1
  working_dir: "/vllm-workspace/"
  source_file_dependencies:
  - csrc/quantization/
  - vllm/compilation/
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/attention/attention.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/fusions_e2e/
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - rocm-smi
  - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"

## There are no ops on ROCm for these tests.
## The test still passes but the logs are not useful.
## fused ops just call torch.ops.symm_mem which
## exists in ROCm even though they don't work
# - label: AsyncTP Correctness Tests (2xH100-2xMI325)
# - label: Fusion E2E TP2 Quick (H100-MI325)
# - label: Fusion E2E TP2 AsyncTP Config Sweep (H100-MI325)
# - label: Fusion E2E TP2 (B200-MI325)
# - label: Sequence Parallel Correctness Tests (2xH100-2xMI325)


#####################################################################################################################################
#                                                                                                                                   #
#                                                             gfx950                                                                #
#                                                                                                                                   #
#####################################################################################################################################

- label: Entrypoints Integration (API Server openai - Part 1) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_1
  fast_check: true
  torch_nightly: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/openai
  - tests/entrypoints/test_chat_utils
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/openai/chat_completion --ignore=entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/chat_completion/test_oot_registration.py


- label: Entrypoints Integration (API Server openai - Part 2) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_1
  fast_check: true
  torch_nightly: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/openai
  - tests/entrypoints/test_chat_utils
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/openai/completion --ignore=entrypoints/openai/completion/test_tensorizer_entrypoint.py
  - pytest -v -s entrypoints/openai/speech_to_text/
  - pytest -v -s entrypoints/test_chat_utils.py


- label: Entrypoints Integration (API Server openai - Part 3) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_1
  fast_check: true
  torch_nightly: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/openai
  - tests/entrypoints/test_chat_utils
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/chat_completion --ignore=entrypoints/openai/completion --ignore=entrypoints/openai/speech_to_text/ --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/tool_parsers/ --ignore=entrypoints/openai/responses


- label: Entrypoints Integration (API Server 2) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_1
  optional: true
  fast_check: true
  torch_nightly: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/rpc
  - tests/entrypoints/serve/instrumentator
  - tests/tool_use
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/serve/instrumentator
  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/rpc
  - pytest -v -s tool_use


- label: Entrypoints Integration (Pooling) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_1
  fast_check: true
  torch_nightly: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/pooling
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/pooling


- label: Regression # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/test_regression
  commands:
  - pip install modelscope
  - pytest -v -s test_regression.py


- label: V1 Spec Decode # TBD
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/v1/spec_decode
  commands:
  - pytest -v -s -m 'not slow_test' v1/spec_decode


- label: V1 Sample + Logits # TBD
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/v1/sample
  - tests/v1/logits_processors
  - tests/v1/test_oracle.py
  - tests/v1/test_request.py
  - tests/v1/test_outputs.py
  commands:
  - pytest -v -s v1/sample
  - pytest -v -s v1/logits_processors
  - pytest -v -s v1/test_oracle.py
  - pytest -v -s v1/test_request.py
  - pytest -v -s v1/test_outputs.py


- label: V1 Core + KV + Metrics # TBD
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/v1/core
  - tests/v1/executor
  - tests/v1/kv_offload
  - tests/v1/worker
  - tests/v1/kv_connector/unit
  - tests/v1/metrics
  - tests/entrypoints/openai/correctness/test_lmeval.py
  commands:
  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
  - pytest -v -s -m 'not cpu_test' v1/core
  - pytest -v -s v1/executor
  - pytest -v -s v1/kv_offload
  - pytest -v -s v1/worker
  - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
  - pytest -v -s -m 'not cpu_test' v1/metrics
  - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
  - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine


- label: V1 Speculative Decoding (slow) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/v1/spec_decode/
  - vllm/model_executor/models/
  - vllm/v1/attention/
  - vllm/model_executor/layers/
  - tests/v1/spec_decode/
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s -m 'slow_test' v1/spec_decode/test_eagle.py
  - pytest -v -s -m 'slow_test' v1/spec_decode/test_extract_hidden_states.py
  - pytest -v -s -m 'slow_test' v1/spec_decode/test_max_len.py
  - pytest -v -s -m 'slow_test' v1/spec_decode/test_mtp.py
  - pytest -v -s -m 'slow_test' v1/spec_decode/test_ngram.py
  - pytest -v -s -m 'slow_test' v1/spec_decode/test_speculators_eagle3.py
  - pytest -v -s -m 'slow_test' v1/spec_decode/test_tree_attention.py


- label: V1 attention (B200-MI355) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/config/attention.py
  - vllm/model_executor/layers/attention
  - vllm/v1/attention
  - tests/v1/attention
  - vllm/_aiter_ops.py
  - vllm/envs.py
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s v1/attention


- label: Examples # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_1
  working_dir: "/vllm-workspace/examples"
  source_file_dependencies:
  - vllm/entrypoints
  - vllm/multimodal
  - examples/
  - vllm/platforms/rocm.py
  commands:
  - pip install tensorizer
  # Basic
  - python3 basic/offline_inference/chat.py --attention-backend TRITON_ATTN
  - python3 basic/offline_inference/generate.py --model facebook/opt-125m
  - python3 basic/offline_inference/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
  - python3 basic/offline_inference/classify.py
  - python3 basic/offline_inference/embed.py
  - python3 basic/offline_inference/score.py
  # Multi-modal models
  - python3 offline_inference/audio_language.py --seed 0
  - python3 offline_inference/vision_language.py --seed 0
  - python3 offline_inference/vision_language_multi_image.py --seed 0
  - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
  # Pooling models
  - python3 pooling/embed/vision_embedding_offline.py --seed 0
  # Features demo
  - python3 offline_inference/prefix_caching.py
  - python3 offline_inference/llm_engine_example.py
  - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
  - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
  - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536


- label: Kernels Attention Test %N # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_1
  parallelism: 2
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - csrc/attention/
  - vllm/v1/attention
  - vllm/model_executor/layers/attention
  - tests/kernels/attention
  - vllm/_aiter_ops.py
  - vllm/envs.py
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT


- label: Kernels Quantization Test %N # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_1
  parallelism: 2
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - csrc/quantization/
  - vllm/model_executor/layers/quantization
  - tests/kernels/quantization
  - tests/kernels/quantization/test_rocm_skinny_gemms.py
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  - vllm/model_executor/kernels/
  commands:
  - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT


- label: Kernels MoE Test %N # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_1
  parallelism: 4
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - csrc/quantization/cutlass_w8a8/moe/
  - csrc/moe/
  - tests/kernels/moe
  - vllm/model_executor/layers/fused_moe/
  - vllm/distributed/device_communicators/
  - vllm/envs.py
  - vllm/config
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - pytest -v -s kernels/moe --ignore=kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  - pytest -v -s kernels/moe/test_modular_oai_triton_moe.py --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT


- label: Kernels FP8 MoE Test # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_2
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - csrc/moe/
  - csrc/quantization/w8a8/cutlass/moe/
  - vllm/model_executor/layers/fused_moe/
  - tests/kernels/moe/test_deepep_moe.py
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  - vllm/envs.py
  commands:
    - pytest -v -s kernels/moe/test_deepep_moe.py


- label: Quantization # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - tests/quantization
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - uv pip install --system torchao==0.17.0
  - uv pip install --system conch-triton-kernels
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py


- label: Language Models Tests (Standard) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_1
  torch_nightly: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/language
  commands:
  - pip freeze | grep -E 'torch'
  - pytest -v -s models/language -m 'core_model and (not slow_test)'


- label: Language Models Test (Extended Generation) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/language/generation
  commands:
  - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr'
  - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
  - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'


- label: Language Models Test (Extended Pooling) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/language/pooling
  commands:
  - pytest -v -s models/language/pooling -m 'not core_model'


- label: "Multi-Modal Models (Standard) 1: qwen2" # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_1
  torch_nightly: true
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen2"
  - pytest -v -s models/multimodal/generation/test_ultravox.py -m core_model


- label: "Multi-Modal Models (Standard) 2: qwen3 + gemma" # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_1
  torch_nightly: true
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "qwen3 or gemma"
  - pytest -v -s models/multimodal/generation/test_qwen2_5_vl.py -m core_model


- label: "Multi-Modal Models (Standard) 3: llava + qwen2_vl" # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_1
  torch_nightly: true
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal/generation
  - tests/models/multimodal/test_mapping.py
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/generation/test_common.py -m core_model -k "not qwen2 and not qwen3 and not gemma"
  - pytest -v -s models/multimodal/generation/test_qwen2_vl.py -m core_model


- label: "Multi-Modal Models (Standard) 4: other + whisper" # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_1
  torch_nightly: true
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal/generation
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/generation/test_ultravox.py --ignore models/multimodal/generation/test_qwen2_5_vl.py --ignore models/multimodal/generation/test_qwen2_vl.py --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
  - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model


- label: Multi-Modal Models (Extended Generation 1) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal/generation
  - tests/models/multimodal/test_mapping.py
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/generation -m 'not core_model' --ignore models/multimodal/generation/test_common.py
  - pytest -v -s models/multimodal/test_mapping.py


- label: Multi-Modal Models (Extended Generation 2) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal/generation
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'


- label: Multi-Modal Models (Extended Generation 3) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal/generation
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'


- label: Multi-Modal Models (Extended Pooling) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_1
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal/pooling
  commands:
  - pytest -v -s models/multimodal/pooling -m 'not core_model'


- label: Quantized Models Test # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_1
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/model_executor/layers/quantization
  - tests/models/quantization
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  - vllm/model_executor/model_loader/
  commands:
  - pytest -v -s models/quantization


- label: Kernels (B200-MI355) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_1
  working_dir: "/vllm-workspace/"
  source_file_dependencies:
  - csrc/quantization/fp4/
  - csrc/attention/mla/
  - csrc/quantization/cutlass_w8a8/moe/
  - vllm/model_executor/layers/fused_moe/cutlass_moe.py
  - vllm/v1/attention/backends/triton_attn.py
  - vllm/v1/attention/backends/rocm_attn.py
  - vllm/v1/attention/backends/rocm_aiter_fa.py
  - vllm/v1/attention/backends/rocm_aiter_unified_attn.py
  - vllm/v1/attention/backends/mla/aiter_triton_mla.py
  - vllm/v1/attention/backends/mla/rocm_aiter_mla.py
  - vllm/v1/attention/selector.py
  - vllm/platforms/rocm.py
  - vllm/_aiter_ops.py
  commands:
  - rocm-smi
  - python3 examples/basic/offline_inference/chat.py
  - pytest -v -s tests/kernels/attention/test_attention_selector.py


- label: Weight Loading Multiple GPU # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_2
  num_gpus: 2
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/
  - tests/weight_loading
  commands:
  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-amd.txt


- label: Weight Loading Multiple GPU - Large Models # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_2
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/weight_loading
  commands:
  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large-amd.txt


- label: Ray Dependency Compatibility Check # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_1
  optional: true
  working_dir: "/"
  source_file_dependencies:
  - requirements/
  - setup.py
  - vllm/platforms/rocm.py
  commands:
  - bash /vllm-workspace/.buildkite/scripts/check-ray-compatibility.sh


- label: Distributed NixlConnector PD accuracy (4 GPUs) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_4
  num_gpus: 4
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
  - tests/v1/kv_connector/nixl_integration/
  - vllm/platforms/rocm.py
  commands:
  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
  - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh


- label: DP EP Distributed NixlConnector PD accuracy tests (4 GPUs) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_4
  num_gpus: 4
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
  - tests/v1/kv_connector/nixl_integration/
  - vllm/platforms/rocm.py
  commands:
  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
  - DP_EP=1 ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh


- label: NixlConnector PD + Spec Decode acceptance (2 GPUs) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_2
  num_gpus: 2
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
  - vllm/v1/worker/kv_connector_model_runner_mixin.py
  - tests/v1/kv_connector/nixl_integration/
  - vllm/platforms/rocm.py
  commands:
  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors_rocm.txt
  - ROCM_ATTN=1 bash v1/kv_connector/nixl_integration/spec_decode_acceptance_test.sh


- label: Distributed Tests (2 GPUs)(H100-MI355) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_2
  num_gpus: 2
  optional: true
  working_dir: "/vllm-workspace/"
  source_file_dependencies:
  - vllm/distributed/
  - vllm/v1/distributed/
  - vllm/model_executor/layers/fused_moe/
  - vllm/v1/attention/backends/
  - vllm/v1/attention/selector.py
  - tests/distributed/test_context_parallel.py
  - tests/v1/distributed/test_dbo.py
  - examples/offline_inference/data_parallel.py
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - export TORCH_NCCL_BLOCKING_WAIT=1
  - pytest -v -s tests/distributed/test_context_parallel.py
  - VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
  - pytest -v -s tests/v1/distributed/test_dbo.py


- label: Distributed Compile Unit Tests (2xH100-2xMI355) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_2
  num_gpus: 2
  optional: true
  working_dir: "/vllm-workspace/"
  source_file_dependencies:
  - vllm/compilation/
  - vllm/model_executor/layers
  - tests/compile/passes/distributed/
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/passes/distributed/test_async_tp.py
  - pytest -v -s tests/compile/passes/distributed/test_sequence_parallelism.py
  # TODO: this test is not supported on ROCm, there are aiter kernels for this.
  # - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
  # - pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
  # - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"


- label: LM Eval Small Models (B200-MI355) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_2
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - vllm/model_executor/models/
  - vllm/model_executor/model_loader/
  - vllm/v1/attention/backends/
  - vllm/v1/attention/selector.py
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-mi3xx-fp8-and-mixed.txt


- label: LM Eval Large Models (4 GPUs)(FP8) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_4
  num_gpus: 4
  optional: true
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - vllm/model_executor/models/
  - vllm/model_executor/model_loader/
  - vllm/v1/attention/backends/
  - vllm/v1/attention/selector.py
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - export VLLM_USE_DEEP_GEMM=0
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-rocm-fp8.txt --tp-size=4


- label: GPQA Eval (GPT-OSS) (B200-MI355) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx955nightly, amdmi355]
  agent_pool: mi355_2
  num_gpus: 2
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - vllm/model_executor/models/
  - vllm/model_executor/model_loader/
  - vllm/v1/attention/backends/
  - vllm/v1/attention/selector.py
  - vllm/model_executor/layers/fused_moe/
  - tests/evals/gpt_oss/
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
    - uv pip install --system 'gpt-oss[eval]==0.0.5'
    - pytest -s -v evals/gpt_oss/test_gpqa_correctness.py --config-list-file=configs/models-gfx950.txt


- label: Qwen3-30B-A3B-FP8-block Accuracy (B200-MI355) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_2
  num_gpus: 2
  working_dir: "/vllm-workspace"
  source_file_dependencies:
  - vllm/model_executor/models/
  - vllm/model_executor/model_loader/
  - vllm/model_executor/layers/quantization/
  - vllm/model_executor/layers/fused_moe/
  - vllm/distributed/eplb
  - vllm/v1/attention/backends/
  - vllm/v1/attention/selector.py
  - .buildkite/scripts/scheduled_integration_test/
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1


- label: Attention Benchmarks Smoke Test (B200-MI355) # TBD
  timeout_in_minutes: 180
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_2
  num_gpus: 2
  working_dir: "/vllm-workspace/"
  source_file_dependencies:
  - benchmarks/attention_benchmarks/
  - vllm/v1/attention/
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - python3 benchmarks/attention_benchmarks/benchmark.py --backends ROCM_ATTN ROCM_AITER_FA ROCM_AITER_UNIFIED_ATTN --batch-specs "8q1s1k" --repeats 1 --warmup-iters 1


- label: LM Eval Qwen3-5 Models (B200-MI355) # TBD
  timeout_in_minutes: 120
  mirror_hardwares: [amdexperimental, amdproduction, amdgfx950nightly, amdmi355]
  agent_pool: mi355_2
  num_gpus: 2
  optional: true
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/model_executor/models/qwen3_5.py
  - vllm/model_executor/models/qwen3_5_mtp.py
  - vllm/transformers_utils/configs/qwen3_5.py
  - vllm/transformers_utils/configs/qwen3_5_moe.py
  - vllm/model_executor/models/qwen.py
  - vllm/model_executor/models/qwen2.py
  - vllm/model_executor/models/qwen3.py
  - vllm/model_executor/models/qwen3_next.py
  - vllm/model_executor/models/qwen3_next_mtp.py
  - vllm/model_executor/layers/fla/ops/
  - vllm/_aiter_ops.py
  - vllm/platforms/rocm.py
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-qwen35-mi355.txt