# Test area header: the "Compile" group, declared with a dependency on
# "image-build". The individual test steps follow under `steps`.
group: Compile
depends_on:
- image-build
steps:
# Sequence-parallel test module, requested with num_devices set to 2.
- label: Sequence Parallel Tests (2 GPUs)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/"
  num_devices: 2
  source_file_dependencies:
  - vllm/model_executor/layers/
  - vllm/compilation/
  - vllm/v1/worker/
  - vllm/v1/cudagraph_dispatcher.py
  - tests/distributed/test_sequence_parallel.py
  commands:
  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
  - pytest -v -s tests/distributed/test_sequence_parallel.py
# Same sequence-parallel test module as the 2-GPU step, but pinned to the
# h100 device and marked optional. No source_file_dependencies are listed.
- label: Sequence Parallel Tests (2xH100)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/"
  device: h100
  optional: true
  num_devices: 2
  commands:
  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
  - pytest -v -s tests/distributed/test_sequence_parallel.py
# Distributed compile unit tests (all-reduce fusion, sequence parallelism,
# async TP), each run as its own pytest invocation on 2 h100 devices.
- label: Distributed Compile Unit Tests (2xH100)
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
  device: h100
  num_devices: 2
  source_file_dependencies:
  - vllm/compilation/
  # NOTE(review): no trailing slash here, unlike the other directory entries
  # in this file -- confirm the path matcher treats both forms the same.
  - vllm/model_executor/layers
  - tests/compile/distributed/test_fusion_all_reduce.py
  - tests/compile/distributed/test_sequence_parallelism.py
  - tests/compile/distributed/test_async_tp.py
  commands:
  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
  - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
  - pytest -v -s tests/compile/distributed/test_async_tp.py
# Fusion and compile unit tests pinned to the b200 device.
- label: Fusion and Compile Unit Tests (B200)
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/"
  device: b200
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/attention/attention.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/  # TODO(luka) limit to vllm/compilation/passes
  - tests/compile/test_fusion_attn.py
  - tests/compile/test_silu_mul_quant_fusion.py
  - tests/compile/distributed/test_fusion_all_reduce.py
  - tests/compile/fullgraph/test_full_graph.py
  commands:
  # b200 runners are limited, so this step runs only the minimal set of tests
  # that are supported exclusively on Blackwell.
  - nvidia-smi
  - pytest -v -s tests/compile/test_fusion_attn.py -k FLASHINFER
  - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
  # This runner has 2 GPUs available even though num_devices=2 is not set.
  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
  # test_fp8_kv_scale_compile requires FlashAttention (not supported on the
  # default L4/L40).
  # TODO(luka) move to H100 once pass tests run on H100
  - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
# Quick end-to-end fusion pass over test_tp1_quant.py on a single h100 device.
- label: Fusion E2E Quick (H100)
  timeout_in_minutes: 15
  working_dir: "/vllm-workspace/"
  device: h100
  num_devices: 1
  source_file_dependencies:
  - csrc/quantization/
  - vllm/model_executor/
  - vllm/v1/attention/
  - vllm/compilation/
  - tests/compile/fusions_e2e/
  commands:
  - nvidia-smi
  # All models and attention backends, restricted to Inductor partition with
  # native custom ops.
  - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
  # Qwen needs +quant_fp8, because rms+quant fusion is not supported with
  # -quant_fp8.
  - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
# Config sweep over test_tp1_quant.py on a single h100 device, limited to the
# llama-3 test cases.
- label: Fusion E2E Config Sweep (H100)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/"
  device: h100
  num_devices: 1
  source_file_dependencies:
  - csrc/quantization/
  - vllm/compilation/
  # The layer files below can affect pattern matching.
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/attention/attention.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/fusions_e2e/
  commands:
  - nvidia-smi
  # Only llama3 (fp8), across all config combinations.
  - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"
# Optional config sweep over test_tp1_quant.py on a single b200 device.
- label: Fusion E2E Config Sweep (B200)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/"
  device: b200
  num_devices: 1
  optional: true
  commands:
  - nvidia-smi
  # One pytest invocation covering three selections, joined with `or`:
  #   1. All models and attn backends, but only Inductor partition and native
  #      custom ops:
  #        inductor_partition and not +rms_norm and not +quant_fp8
  #   2. Qwen, which requires +quant_fp8 because rms+quant fusion is not
  #      supported with -quant_fp8:
  #        inductor_partition and not +rms_norm and +quant_fp8 and qwen3
  #   3. Just llama3 (fp8 & fp4) for all config combinations:
  #        llama-3
  # The selections must live in a single -k expression: pytest keeps only the
  # last -k it is given, so repeating the flag would silently drop the first
  # two selections.
  # NOTE(review): this selects more tests than the previous effective
  # `-k "llama-3"`; confirm timeout_in_minutes is still sufficient.
  - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "(inductor_partition and not +rms_norm and not +quant_fp8) or (inductor_partition and not +rms_norm and +quant_fp8 and qwen3) or llama-3"
# Quick end-to-end TP2 fusion pass (AR-RMS and AsyncTP test files) on 2 h100
# devices.
- label: Fusion E2E TP2 Quick (H100)
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/"
  device: h100
  num_devices: 2
  source_file_dependencies:
  - csrc/quantization/
  - vllm/model_executor/
  - vllm/v1/attention/
  - vllm/compilation/
  - tests/compile/fusions_e2e/
  commands:
  - nvidia-smi
  # All models and attention backends, restricted to Inductor partition with
  # native custom ops (applies to both invocations below).
  - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
  - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
# Config sweep over test_tp2_ar_rms.py on 2 h100 devices, limited to the
# llama-3 test cases.
- label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
  device: h100
  num_devices: 2
  source_file_dependencies:
  - csrc/quantization/
  - vllm/compilation/
  # The layer files below can affect pattern matching.
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/attention/attention.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/fusions_e2e/
  commands:
  - nvidia-smi
  # Only llama3 (fp4 & fp8 & bf16), across all config combinations.
  - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3"
# Config sweep over test_tp2_async_tp.py on 2 h100 devices, limited to the
# llama-3 test cases.
- label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
  device: h100
  num_devices: 2
  source_file_dependencies:
  - csrc/quantization/
  - vllm/compilation/
  # The layer files below can affect pattern matching.
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/attention/attention.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  - tests/compile/fusions_e2e/
  commands:
  - nvidia-smi
  # Only llama3 (fp8 & bf16), across all config combinations.
  - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"
# End-to-end TP2 fusion tests (AR-RMS and AsyncTP test files) on 2 b200
# devices.
- label: Fusion E2E TP2 (B200)
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/"
  device: b200
  num_devices: 2
  source_file_dependencies:
  - csrc/quantization/
  - vllm/model_executor/
  - vllm/v1/attention/
  - vllm/compilation/
  - tests/compile/fusions_e2e/
  commands:
  - nvidia-smi
  # Run all models and attn backends but only Inductor partition and native
  # custom ops; for ar-rms-quant-fp4, also sweep llama3.
  # Both selections are joined with `or` inside a single -k expression:
  # pytest keeps only the last -k it is given, so repeating the flag would
  # silently drop the inductor_partition selection.
  # NOTE(review): this selects more tests than the previous effective
  # `-k "Llama-3.1-8B-Instruct-FP4"`; confirm timeout_in_minutes still fits.
  - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(inductor_partition and not +rms_norm and not +quant_fp8) or Llama-3.1-8B-Instruct-FP4"
  - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"