- group: Compile
  depends_on:
    - image-build
  steps:
    - label: Sequence Parallel Correctness Tests (2 GPUs)
      timeout_in_minutes: 50
      working_dir: "/vllm-workspace/"
      num_devices: 2
      source_file_dependencies:
        - vllm/model_executor/layers/
        - vllm/compilation/
        - vllm/v1/worker/
        - vllm/v1/cudagraph_dispatcher.py
        - tests/compile/correctness_e2e/test_sequence_parallel.py
      commands:
        - export VLLM_TEST_CLEAN_GPU_MEMORY=1
        - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py

    - label: Sequence Parallel Correctness Tests (2xH100)
      timeout_in_minutes: 50
      working_dir: "/vllm-workspace/"
      device: h100
      optional: true
      num_devices: 2
      commands:
        - export VLLM_TEST_CLEAN_GPU_MEMORY=1
        - pytest -v -s tests/compile/correctness_e2e/test_sequence_parallel.py

    - label: AsyncTP Correctness Tests (2xH100)
      timeout_in_minutes: 50
      working_dir: "/vllm-workspace/"
      device: h100
      optional: true
      num_devices: 2
      commands:
        - export VLLM_TEST_CLEAN_GPU_MEMORY=1
        - pytest -v -s tests/compile/correctness_e2e/test_async_tp.py

    - label: Distributed Compile Unit Tests (2xH100)
      timeout_in_minutes: 20
      working_dir: "/vllm-workspace/"
      device: h100
      num_devices: 2
      source_file_dependencies:
        - vllm/compilation/
        - vllm/model_executor/layers
        - tests/compile/passes/distributed/
      commands:
        - export VLLM_TEST_CLEAN_GPU_MEMORY=1
        - pytest -s -v tests/compile/passes/distributed

    - label: Fusion and Compile Unit Tests (B200)
      timeout_in_minutes: 20
      working_dir: "/vllm-workspace/"
      device: b200
      source_file_dependencies:
        - csrc/quantization/fp4/
        - vllm/model_executor/layers/quantization/
        - vllm/model_executor/layers/layernorm.py
        - vllm/model_executor/layers/activation.py
        - vllm/model_executor/layers/attention/attention.py
        - vllm/v1/attention/backends/flashinfer.py
        - vllm/compilation/  # TODO(luka) limit to vllm/compilation/passes
        - tests/compile/passes/test_fusion_attn.py
        - tests/compile/passes/test_silu_mul_quant_fusion.py
        - tests/compile/passes/distributed/test_fusion_all_reduce.py
        - tests/compile/fullgraph/test_full_graph.py
      commands:
        # b200 runners are limited, so we limit the tests to the minimum set only supported on Blackwell
        - nvidia-smi
        - pytest -v -s tests/compile/passes/test_fusion_attn.py -k FLASHINFER
        - pytest -v -s tests/compile/passes/test_silu_mul_quant_fusion.py
        # this runner has 2 GPUs available even though num_devices=2 is not set
        - pytest -v -s tests/compile/passes/distributed/test_fusion_all_reduce.py
        # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
        # TODO(luka) move to H100 once pass tests run on H100
        - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

    - label: Fusion E2E Quick (H100)
      timeout_in_minutes: 15
      working_dir: "/vllm-workspace/"
      device: h100
      num_devices: 1
      source_file_dependencies:
        - csrc/quantization/
        - vllm/model_executor/
        - vllm/v1/attention/
        - vllm/compilation/
        - tests/compile/fusions_e2e/
      commands:
        - nvidia-smi
        # Run all models and attn backends but only Inductor partition and native custom ops
        - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
        # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
        - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"

    - label: Fusion E2E Config Sweep (H100)
      timeout_in_minutes: 30
      working_dir: "/vllm-workspace/"
      device: h100
      num_devices: 1
      source_file_dependencies:
        - csrc/quantization/
        - vllm/compilation/
        # can affect pattern matching
        - vllm/model_executor/layers/layernorm.py
        - vllm/model_executor/layers/activation.py
        - vllm/model_executor/layers/attention/attention.py
        - vllm/model_executor/layers/quantization/input_quant_fp8.py
        - tests/compile/fusions_e2e/
      commands:
        - nvidia-smi
        # Run just llama3 (fp8) for all config combinations
        - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"

    - label: Fusion E2E Config Sweep (B200)
      timeout_in_minutes: 30
      working_dir: "/vllm-workspace/"
      device: b200
      num_devices: 1
      optional: true
      commands:
        - nvidia-smi
        # Run all models but only FLASHINFER, Inductor partition and native custom ops
        # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
        # Run just llama3 (fp8 & fp4) for all config combinations (only inductor partition)
        - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and (FLASHINFER and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3) or llama-3)"

    - label: Fusion E2E TP2 Quick (H100)
      timeout_in_minutes: 20
      working_dir: "/vllm-workspace/"
      device: h100
      num_devices: 2
      source_file_dependencies:
        - csrc/quantization/
        - vllm/model_executor/
        - vllm/v1/attention/
        - vllm/compilation/
        - tests/compile/fusions_e2e/
      commands:
        - nvidia-smi
        # Run all models and attn backends but only Inductor partition and native custom ops
        - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
        - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"

    - label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
      timeout_in_minutes: 40
      working_dir: "/vllm-workspace/"
      device: h100
      num_devices: 2
      source_file_dependencies:
        - csrc/quantization/
        - vllm/compilation/
        # can affect pattern matching
        - vllm/model_executor/layers/layernorm.py
        - vllm/model_executor/layers/activation.py
        - vllm/model_executor/layers/attention/attention.py
        - vllm/model_executor/layers/quantization/input_quant_fp8.py
        - tests/compile/fusions_e2e/
      commands:
        - nvidia-smi
        # Run just llama3 (fp8 & bf16) for all config combinations
        - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3"

    - label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
      timeout_in_minutes: 40
      working_dir: "/vllm-workspace/"
      device: h100
      num_devices: 2
      source_file_dependencies:
        - csrc/quantization/
        - vllm/compilation/
        # can affect pattern matching
        - vllm/model_executor/layers/layernorm.py
        - vllm/model_executor/layers/activation.py
        - vllm/model_executor/layers/attention/attention.py
        - vllm/model_executor/layers/quantization/input_quant_fp8.py
        - tests/compile/fusions_e2e/
      commands:
        - nvidia-smi
        # Run just llama3 (fp8 & bf16) for all config combinations
        - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"

    - label: Fusion E2E TP2 (B200)
      timeout_in_minutes: 20
      working_dir: "/vllm-workspace/"
      device: b200
      num_devices: 2
      source_file_dependencies:
        - csrc/quantization/
        - vllm/model_executor/
        - vllm/v1/attention/
        - vllm/compilation/
        - tests/compile/fusions_e2e/
      commands:
        - nvidia-smi
        # Run all models but only FLASHINFER, Inductor partition and native custom ops
        # include qwen with +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
        # for ar-rms-quant-fp4, also sweep llama3
        - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)) or Llama-3.1-8B-Instruct-FP4"
        - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "FLASHINFER and inductor_partition and not +rms_norm and (not +quant_fp8 or +quant_fp8 and qwen3)"