[CI][torch.compile] Reduce e2e fusion test time (#33293)

Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Signed-off-by: ProExpertProg <luka.govedic@gmail.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
This commit is contained in:
Luka Govedič
2026-02-04 19:09:03 -05:00
committed by GitHub
parent 439afa4eea
commit 4d9513537d
17 changed files with 1068 additions and 821 deletions

View File

@@ -2,56 +2,196 @@ group: Compile
depends_on:
- image-build
steps:
- label: Fusion and Compile Tests (B200)
- label: Sequence Parallel Tests (2 GPUs)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/"
num_devices: 2
source_file_dependencies:
- vllm/model_executor/layers/
- vllm/compilation/
- vllm/v1/worker/
- vllm/v1/cudagraph_dispatcher.py
- tests/distributed/test_sequence_parallel.py
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -v -s tests/distributed/test_sequence_parallel.py
- label: Sequence Parallel Tests (2xH100)
timeout_in_minutes: 50
working_dir: "/vllm-workspace/"
device: h100
optional: true
num_devices: 2
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -v -s tests/distributed/test_sequence_parallel.py
- label: Distributed Compile Unit Tests (2xH100)
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
device: h100
num_devices: 2
source_file_dependencies:
- vllm/compilation/
- vllm/model_executor/layers
- tests/compile/distributed/test_fusion_all_reduce.py
- tests/compile/distributed/test_sequence_parallelism.py
- tests/compile/distributed/test_async_tp.py
commands:
- export VLLM_TEST_CLEAN_GPU_MEMORY=1
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
- pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
- pytest -v -s tests/compile/distributed/test_async_tp.py
- label: Fusion and Compile Unit Tests (B200)
timeout_in_minutes: 20
working_dir: "/vllm-workspace/"
device: b200
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/v1/worker/
- vllm/v1/cudagraph_dispatcher.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/quantization/
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- vllm/model_executor/layers/attention/attention.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes
- tests/compile/test_fusion_attn.py
- tests/compile/test_silu_mul_quant_fusion.py
- tests/compile/distributed/test_fusion_all_reduce.py
- tests/compile/distributed/test_fusions_e2e.py
- tests/compile/fullgraph/test_full_graph.py
commands:
# b200 runners are limited, so we limit the tests to the minimum set only supported on Blackwell
- nvidia-smi
- pytest -v -s tests/compile/test_fusion_attn.py
- pytest -v -s tests/compile/test_fusion_attn.py -k FLASHINFER
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
# this runner has 2 GPUs available even though num_devices=2 is not set
- pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
# Wrap with quotes to escape yaml
- "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
# TODO(luka) move to H100 once pass tests run on H100
- pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile
- label: Fusion E2E (2 GPUs)(B200)
timeout_in_minutes: 40
- label: Fusion E2E Quick (H100)
timeout_in_minutes: 15
working_dir: "/vllm-workspace/"
device: b200
optional: true
num_devices: 2
device: h100
num_devices: 1
source_file_dependencies:
- csrc/quantization/fp4/
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/distributed/test_fusions_e2e.py
- csrc/quantization/
- vllm/model_executor/
- vllm/v1/attention/
- vllm/compilation/
- tests/compile/fusions_e2e/
commands:
- nvidia-smi
# Run all e2e fusion tests
- pytest -v -s tests/compile/distributed/test_fusions_e2e.py
# Run all models and attn backends but only Inductor partition and native custom ops
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
# Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
- label: Fusion E2E Config Sweep (H100)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/"
device: h100
num_devices: 1
source_file_dependencies:
- csrc/quantization/
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/attention/attention.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/fusions_e2e/
commands:
- nvidia-smi
# Run just llama3 (fp8) for all config combinations
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"
- label: Fusion E2E Config Sweep (B200)
timeout_in_minutes: 30
working_dir: "/vllm-workspace/"
device: b200
num_devices: 1
optional: true
commands:
- nvidia-smi
# Run all models and attn backends but only Inductor partition and native custom ops
# -k "inductor_partition and not +rms_norm and not +quant_fp8"
# Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
# -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"
# Run just llama3 (fp8 & fp4) for all config combinations
# -k "llama-3"
- pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3" -k "llama-3"
- label: Fusion E2E TP2 Quick (H100)
timeout_in_minutes: 20
working_dir: "/vllm-workspace/"
device: h100
num_devices: 2
source_file_dependencies:
- csrc/quantization/
- vllm/model_executor/
- vllm/v1/attention/
- vllm/compilation/
- tests/compile/fusions_e2e/
commands:
- nvidia-smi
# Run all models and attn backends but only Inductor partition and native custom ops
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
- label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
device: h100
num_devices: 2
source_file_dependencies:
- csrc/quantization/
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/attention/attention.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/fusions_e2e/
commands:
- nvidia-smi
# Run just llama3 (fp4 & fp8 & bf16) for all config combinations
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3"
- label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
timeout_in_minutes: 40
working_dir: "/vllm-workspace/"
device: h100
num_devices: 2
source_file_dependencies:
- csrc/quantization/
- vllm/compilation/
# can affect pattern matching
- vllm/model_executor/layers/layernorm.py
- vllm/model_executor/layers/activation.py
- vllm/model_executor/layers/attention/attention.py
- vllm/model_executor/layers/quantization/input_quant_fp8.py
- tests/compile/fusions_e2e/
commands:
- nvidia-smi
# Run just llama3 (fp8 & bf16) for all config combinations
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"
- label: Fusion E2E TP2 (B200)
timeout_in_minutes: 20
working_dir: "/vllm-workspace/"
device: b200
num_devices: 2
source_file_dependencies:
- csrc/quantization/
- vllm/model_executor/
- vllm/v1/attention/
- vllm/compilation/
- tests/compile/fusions_e2e/
commands:
- nvidia-smi
# Run all models and attn backends but only Inductor partition and native custom ops
# for ar-rms-quant-fp4, also sweep llama3
- pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8" -k "Llama-3.1-8B-Instruct-FP4"
- pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"