# CI group: compilation correctness & fusion tests.
# Runs only after the container image is built; individual steps are gated on
# the source paths they exercise via source_file_dependencies.
group: Compile
depends_on:
  - image-build
steps:
  - label: Sequence Parallel Correctness Tests (2 GPUs)
    timeout_in_minutes: 50
    working_dir: "/vllm-workspace/"
    num_devices: 2
    source_file_dependencies:
      - vllm/model_executor/layers/
      - vllm/compilation/
      - vllm/v1/worker/
      - vllm/v1/cudagraph_dispatcher.py
      - tests/compile/correctness_e2e/test_sequence_parallel.py
    commands:
      - export VLLM_TEST_CLEAN_GPU_MEMORY=1
      - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py

  - label: Sequence Parallel Correctness Tests (2xH100)
    timeout_in_minutes: 50
    working_dir: "/vllm-workspace/"
    device: h100
    optional: true
    num_devices: 2
    commands:
      - export VLLM_TEST_CLEAN_GPU_MEMORY=1
      - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py

  - label: AsyncTP Correctness Tests (2xH100)
    timeout_in_minutes: 50
    working_dir: "/vllm-workspace/"
    device: h100
    optional: true
    num_devices: 2
    commands:
      - export VLLM_TEST_CLEAN_GPU_MEMORY=1
      - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py

  - label: Distributed Compile Unit Tests (2xH100)
    timeout_in_minutes: 20
    working_dir: "/vllm-workspace/"
    device: h100
    num_devices: 2
    source_file_dependencies:
      - vllm/compilation/
      - vllm/model_executor/layers
      - tests/compile/passes/distributed/
    commands:
      - export VLLM_TEST_CLEAN_GPU_MEMORY=1
      - pytest -s -v tests/compile/passes/distributed

  - label: Fusion and Compile Unit Tests (B200)
    timeout_in_minutes: 20
    working_dir: "/vllm-workspace/"
    device: b200
    source_file_dependencies:
      - csrc/quantization/fp4/
      - vllm/model_executor/layers/quantization/
      - vllm/model_executor/layers/layernorm.py
      - vllm/model_executor/layers/activation.py
      - vllm/model_executor/layers/attention/attention.py
      - vllm/v1/attention/backends/flashinfer.py
      - vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes
      - tests/compile/passes/test_fusion_attn.py
      - tests/compile/passes/test_silu_mul_quant_fusion.py
      - tests/compile/passes/distributed/test_fusion_all_reduce.py
      - tests/compile/fullgraph/test_full_graph.py
    commands:
      # b200 runners are limited, so we limit the tests to the minimum set only supported on Blackwell
      - nvidia-smi
      - pytest -v -s tests/compile/passes/test_fusion_attn.py -k FLASHINFER
      - pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
      # this runner has 2 GPUs available even though num_devices=2 is not set
      - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
      # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
      # TODO(luka) move to H100 once pass tests run on H100
      - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

  - label: Fusion E2E Quick (H100)
    timeout_in_minutes: 15
    working_dir: "/vllm-workspace/"
    device: h100
    num_devices: 1
    source_file_dependencies:
      - csrc/quantization/
      - vllm/model_executor/
      - vllm/v1/attention/
      - vllm/compilation/
      - tests/compile/fusions_e2e/
    commands:
      - nvidia-smi
      # Run all models and attn backends but only Inductor partition and native custom ops
      - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
      # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
      - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"

  - label: Fusion E2E Config Sweep (H100)
    timeout_in_minutes: 30
    working_dir: "/vllm-workspace/"
    device: h100
    num_devices: 1
    source_file_dependencies:
      - csrc/quantization/
      - vllm/compilation/ # can affect pattern matching
      - vllm/model_executor/layers/layernorm.py
      - vllm/model_executor/layers/activation.py
      - vllm/model_executor/layers/attention/attention.py
      - vllm/model_executor/layers/quantization/input_quant_fp8.py
      - tests/compile/fusions_e2e/
    commands:
      - nvidia-smi
      # Run just llama3 (fp8) for all config combinations
      - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"

  - label: Fusion E2E Config Sweep (B200)
    timeout_in_minutes: 30
    working_dir: "/vllm-workspace/"
    device: b200
    num_devices: 1
    optional: true
    commands:
      - nvidia-smi
      # Run all models but only FLASHINFER, Inductor partition and native custom ops
      # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
      # Run just llama3 (fp8 & fp4) for all config combinations (only inductor partition)
      - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3) or llama-3)"

  - label: Fusion E2E TP2 Quick (H100)
    timeout_in_minutes: 20
    working_dir: "/vllm-workspace/"
    device: h100
    num_devices: 2
    source_file_dependencies:
      - csrc/quantization/
      - vllm/model_executor/
      - vllm/v1/attention/
      - vllm/compilation/
      - tests/compile/fusions_e2e/
    commands:
      - nvidia-smi
      # Run all models and attn backends but only Inductor partition and native custom ops
      - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
      - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"

  - label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
    timeout_in_minutes: 40
    working_dir: "/vllm-workspace/"
    device: h100
    num_devices: 2
    source_file_dependencies:
      - csrc/quantization/
      - vllm/compilation/ # can affect pattern matching
      - vllm/model_executor/layers/layernorm.py
      - vllm/model_executor/layers/activation.py
      - vllm/model_executor/layers/attention/attention.py
      - vllm/model_executor/layers/quantization/input_quant_fp8.py
      - tests/compile/fusions_e2e/
    commands:
      - nvidia-smi
      # Run just llama3 (fp8 & bf16) for all config combinations
      - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3"

  - label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
    timeout_in_minutes: 40
    working_dir: "/vllm-workspace/"
    device: h100
    num_devices: 2
    source_file_dependencies:
      - csrc/quantization/
      - vllm/compilation/ # can affect pattern matching
      - vllm/model_executor/layers/layernorm.py
      - vllm/model_executor/layers/activation.py
      - vllm/model_executor/layers/attention/attention.py
      - vllm/model_executor/layers/quantization/input_quant_fp8.py
      - tests/compile/fusions_e2e/
    commands:
      - nvidia-smi
      # Run just llama3 (fp8 & bf16) for all config combinations
      - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"

  - label: Fusion E2E TP2 (B200)
    timeout_in_minutes: 20
    working_dir: "/vllm-workspace/"
    device: b200
    num_devices: 2
    source_file_dependencies:
      - csrc/quantization/
      - vllm/model_executor/
      - vllm/v1/attention/
      - vllm/compilation/
      - tests/compile/fusions_e2e/
    commands:
      - nvidia-smi
      # Run all models but only FLASHINFER, Inductor partition and native custom ops
      # include qwen with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
      # for ar-rms-quant-fp4, also sweep llama3
      - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)) or Llama-3.1-8B-Instruct-FP4"
      - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)"