# File-viewer banner carried over with the paste (not pipeline content): "58 lines, 2.2 KiB, YAML"
# Pipeline definition for the "Compile" test group.
group: Compile
# NOTE(review): presumably this gates the group on the `image-build` step;
# the exact semantics come from the pipeline tooling, not from this file.
depends_on:
- image-build
# One list item per CI step in this group.
steps:
# Fusion and compile tests, requested on a `b200` GPU runner.
- label: Fusion and Compile Tests (B200)
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
  gpu: b200
  # NOTE(review): presumably the step is only scheduled when one of these
  # paths changes; confirm against the pipeline generator.
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/v1/worker/
  - vllm/v1/cudagraph_dispatcher.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  # the test files executed by `commands` below
  - tests/compile/test_fusion_attn.py
  - tests/compile/test_silu_mul_quant_fusion.py
  - tests/compile/distributed/test_fusion_all_reduce.py
  - tests/compile/distributed/test_fusions_e2e.py
  - tests/compile/fullgraph/test_full_graph.py
  commands:
  # Log the GPUs visible to the job before any test runs.
  - nvidia-smi
  - pytest -v -s tests/compile/test_fusion_attn.py
  - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
  # this runner has 2 GPUs available even though num_gpus=2 is not set
  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
  # Limit to Inductor partition, no custom ops, and allreduce & attn fusion
  # to reduce running time.
  # The whole command is double-quoted so YAML reads it as a single string,
  # including the single-quoted -k expression.
  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
  # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
  - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

# Complete end-to-end fusion test file, requested on `b200` with two GPUs.
- label: Fusion E2E (2 GPUs)(B200)
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
  gpu: b200
  # NOTE(review): presumably an optional step is not run by default and must
  # be triggered explicitly; confirm against the pipeline generator.
  optional: true
  num_gpus: 2
  # NOTE(review): presumably the step is only scheduled when one of these
  # paths changes; confirm against the pipeline generator.
  source_file_dependencies:
  - csrc/quantization/fp4/
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/
  # can affect pattern matching
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
  - vllm/model_executor/layers/quantization/input_quant_fp8.py
  # the test file executed by `commands` below
  - tests/compile/distributed/test_fusions_e2e.py
  commands:
  # Log the GPUs visible to the job before any test runs.
  - nvidia-smi
  # Run all e2e fusion tests
  - pytest -v -s tests/compile/distributed/test_fusions_e2e.py