[CI][torch.compile] Reduce e2e fusion test time (#33293)

Signed-off-by: Luka Govedič <lgovedic@redhat.com> Signed-off-by: ProExpertProg <luka.govedic@gmail.com> Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2026-02-04 19:09:03 -05:00
parent 439afa4eea
commit 4d9513537d
17 changed files with 1068 additions and 821 deletions
--- a/.buildkite/test_areas/compile.yaml
+++ b/.buildkite/test_areas/compile.yaml
@@ -2,56 +2,196 @@ group: Compile
 depends_on: 
  - image-build
 steps:
- label: Fusion and Compile Tests (B200)
+- label: Sequence Parallel Tests (2 GPUs)
+  timeout_in_minutes: 50
+  working_dir: "/vllm-workspace/"  # paths in the commands below are relative to this directory
+  num_devices: 2
+  source_file_dependencies:
+  - vllm/model_executor/layers/
+  - vllm/compilation/
+  - vllm/v1/worker/
+  - vllm/v1/cudagraph_dispatcher.py
+  - tests/distributed/test_sequence_parallel.py
+  commands:
+  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+  - pytest -v -s tests/distributed/test_sequence_parallel.py
+
+- label: Sequence Parallel Tests (2xH100)
+  timeout_in_minutes: 50
+  working_dir: "/vllm-workspace/"
+  device: h100
+  optional: true  # NOTE(review): no source_file_dependencies on this step, unlike the 2-GPU variant of the same test; confirm this is intended
+  num_devices: 2
+  commands:
+  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+  - pytest -v -s tests/distributed/test_sequence_parallel.py
+
+- label: Distributed Compile Unit Tests (2xH100)
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
+  device: h100
+  num_devices: 2
+  source_file_dependencies:  # directories carry a trailing slash, individual files are listed by full path
+  - vllm/compilation/
+  - vllm/model_executor/layers/
+  - tests/compile/distributed/test_fusion_all_reduce.py
+  - tests/compile/distributed/test_sequence_parallelism.py
+  - tests/compile/distributed/test_async_tp.py
+  commands:
+  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+  - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+  - pytest -v -s tests/compile/distributed/test_async_tp.py
+
+- label: Fusion and Compile Unit Tests (B200)
+  timeout_in_minutes: 20
+  working_dir: "/vllm-workspace/"
  device: b200
  source_file_dependencies:
  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/v1/worker/
-  - vllm/v1/cudagraph_dispatcher.py
-  - vllm/compilation/
-  # can affect pattern matching
+  - vllm/model_executor/layers/quantization/
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - vllm/model_executor/layers/attention/attention.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/  # TODO(luka) limit to vllm/compilation/passes
  - tests/compile/test_fusion_attn.py
  - tests/compile/test_silu_mul_quant_fusion.py
  - tests/compile/distributed/test_fusion_all_reduce.py
-  - tests/compile/distributed/test_fusions_e2e.py
  - tests/compile/fullgraph/test_full_graph.py
  commands:
+    # B200 runners are limited, so this step runs only the minimal set of tests that are supported only on Blackwell
    - nvidia-smi
-    - pytest -v -s tests/compile/test_fusion_attn.py
+    - pytest -v -s tests/compile/test_fusion_attn.py -k FLASHINFER
    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
    # this runner has 2 GPUs available even though num_devices=2 is not set
    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-    # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
-    # Wrap with quotes to escape yaml
-    - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+    # TODO(luka) move to H100 once pass tests run on H100
    - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

- label: Fusion E2E (2 GPUs)(B200)
-  timeout_in_minutes: 40
+- label: Fusion E2E Quick (H100)
+  timeout_in_minutes: 15
  working_dir: "/vllm-workspace/"
-  device: b200
-  optional: true
-  num_devices: 2
+  device: h100
+  num_devices: 1
  source_file_dependencies:
-  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/
-  # can affect pattern matching
-  - vllm/model_executor/layers/layernorm.py
-  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/distributed/test_fusions_e2e.py
+    - csrc/quantization/
+    - vllm/model_executor/
+    - vllm/v1/attention/
+    - vllm/compilation/
+    - tests/compile/fusions_e2e/
  commands:
    - nvidia-smi
-    # Run all e2e fusion tests
-    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
+    # Run all models and attn backends, restricted by -k to Inductor partition and native custom ops
+    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
+    # Qwen is run with +quant_fp8 instead, because rms+quant fusion is not supported with -quant_fp8
+    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"

+- label: Fusion E2E Config Sweep (H100)
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/"
+  device: h100
+  num_devices: 1
+  source_file_dependencies:
+    - csrc/quantization/
+    - vllm/compilation/
+    # can affect pattern matching
+    - vllm/model_executor/layers/layernorm.py
+    - vllm/model_executor/layers/activation.py
+    - vllm/model_executor/layers/attention/attention.py
+    - vllm/model_executor/layers/quantization/input_quant_fp8.py
+    - tests/compile/fusions_e2e/
+  commands:
+    - nvidia-smi
+    # Select only llama3 (fp8) via -k, but run it for all config combinations
+    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"
+
+- label: Fusion E2E Config Sweep (B200)
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/"
+  device: b200
+  num_devices: 1
+  optional: true
+  commands:
+    - nvidia-smi
+    # pytest honors only the last -k option, so the three selections are OR-ed into a single expression:
+    # 1. all models and attn backends, but only Inductor partition and native custom ops
+    #    (inductor_partition and not +rms_norm and not +quant_fp8)
+    # 2. Qwen with +quant_fp8, as -quant_fp8 rms+quant fusion is not supported
+    #    (inductor_partition and not +rms_norm and +quant_fp8 and qwen3)
+    # 3. just llama3 (fp8 & fp4) for all config combinations (llama-3)
+    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "(inductor_partition and not +rms_norm and not +quant_fp8) or (inductor_partition and not +rms_norm and +quant_fp8 and qwen3) or llama-3"
+
+- label: Fusion E2E TP2 Quick (H100)
+  timeout_in_minutes: 20
+  working_dir: "/vllm-workspace/"
+  device: h100
+  num_devices: 2
+  source_file_dependencies:
+    - csrc/quantization/
+    - vllm/model_executor/
+    - vllm/v1/attention/
+    - vllm/compilation/
+    - tests/compile/fusions_e2e/
+  commands:
+    - nvidia-smi
+    # Run all models and attn backends, restricted by -k to Inductor partition and native custom ops (same filter for both files)
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
+
+- label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  device: h100
+  num_devices: 2
+  source_file_dependencies:
+    - csrc/quantization/
+    - vllm/compilation/
+    # can affect pattern matching
+    - vllm/model_executor/layers/layernorm.py
+    - vllm/model_executor/layers/activation.py
+    - vllm/model_executor/layers/attention/attention.py
+    - vllm/model_executor/layers/quantization/input_quant_fp8.py
+    - tests/compile/fusions_e2e/
+  commands:
+    - nvidia-smi
+    # Select only llama3 (fp4 & fp8 & bf16) via -k, but run it for all config combinations
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3"
+
+- label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  device: h100
+  num_devices: 2
+  source_file_dependencies:
+    - csrc/quantization/
+    - vllm/compilation/
+    # can affect pattern matching
+    - vllm/model_executor/layers/layernorm.py
+    - vllm/model_executor/layers/activation.py
+    - vllm/model_executor/layers/attention/attention.py
+    - vllm/model_executor/layers/quantization/input_quant_fp8.py
+    - tests/compile/fusions_e2e/
+  commands:
+    - nvidia-smi
+    # Select only llama3 (fp8 & bf16) via -k, but run it for all config combinations
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"
+
+- label: Fusion E2E TP2 (B200)
+  timeout_in_minutes: 20
+  working_dir: "/vllm-workspace/"
+  device: b200
+  num_devices: 2
+  source_file_dependencies:
+    - csrc/quantization/
+    - vllm/model_executor/
+    - vllm/v1/attention/
+    - vllm/compilation/
+    - tests/compile/fusions_e2e/
+  commands:
+    - nvidia-smi
+    # Run all models and attn backends but only Inductor partition and native custom ops; for ar-rms-quant-fp4, also sweep llama3.
+    # pytest honors only the last -k option, so both selections are combined with "or" in a single expression.
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(inductor_partition and not +rms_norm and not +quant_fp8) or Llama-3.1-8B-Instruct-FP4"
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
--- a/.buildkite/test_areas/distributed.yaml
+++ b/.buildkite/test_areas/distributed.yaml
@@ -16,7 +16,7 @@ steps:
  - pytest -v -s distributed/test_shm_storage.py

 - label: Distributed (2 GPUs)
-  timeout_in_minutes: 90
+  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/tests"
  num_devices: 2
  source_file_dependencies:
@@ -47,7 +47,6 @@ steps:
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
-  - pytest -v -s distributed/test_sequence_parallel.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
  - pytest -v -s v1/worker/test_worker_memory_snapshot.py

@@ -133,25 +132,13 @@ steps:
  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - pytest -v -s -x lora/test_mixtral.py

- label: Sequence Parallel Tests (H100)
-  timeout_in_minutes: 60
-  working_dir: "/vllm-workspace/"
-  device: h100
-  optional: true
-  num_devices: 2
-  commands:
-    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-    # Run sequence parallel tests
-    - pytest -v -s tests/distributed/test_sequence_parallel.py
-    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
-
 - label: Distributed Tests (2 GPUs)(H100)
+  timeout_in_minutes: 15
  device: h100
  optional: true
  working_dir: "/vllm-workspace/"
  num_devices: 2
  commands:
-    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
    - pytest -v -s tests/distributed/test_context_parallel.py
    - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
    - pytest -v -s tests/v1/distributed/test_dbo.py
@@ -217,45 +204,3 @@ steps:
  commands:
  - pytest -v -s distributed/test_pp_cudagraph.py
  - pytest -v -s distributed/test_pipeline_parallel.py
-
- label: Hopper Fusion E2E Tests (H100)
-  timeout_in_minutes: 70
-  working_dir: "/vllm-workspace/"
-  device: h100
-  optional: true
-  source_file_dependencies:
-  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/
-  # can affect pattern matching
-  - vllm/model_executor/layers/layernorm.py
-  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/test_fusion_attn.py
-  commands:
-    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-    # skip Llama-4 since it does not fit on this device
-    - pytest -v -s tests/compile/test_fusion_attn.py -k 'not Llama-4'
-
- label: Hopper Fusion Distributed E2E Tests (2xH100)
-  timeout_in_minutes: 70
-  working_dir: "/vllm-workspace/"
-  device: h100
-  optional: true
-  num_devices: 2
-  source_file_dependencies:
-  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/
-  # can affect pattern matching
-  - vllm/model_executor/layers/layernorm.py
-  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/distributed/test_fusions_e2e.py
-  commands:
-    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-    # Run all e2e fusion tests
-    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
-    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
--- a/.buildkite/test_areas/pytorch.yaml
+++ b/.buildkite/test_areas/pytorch.yaml
@@ -18,7 +18,7 @@ steps:
  - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"

 - label: PyTorch Fullgraph Smoke Test
-  timeout_in_minutes: 30
+  timeout_in_minutes: 35
  source_file_dependencies:
  - vllm/
  - tests/compile
@@ -30,16 +30,13 @@ steps:
  - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"

 - label: PyTorch Fullgraph
-  timeout_in_minutes: 40
+  timeout_in_minutes: 30
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
    # fp8 kv scales not supported on sm89, tested on Blackwell instead
  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
-    # Limit to no custom ops to reduce running time
-    # Wrap with quotes to escape yaml and avoid starting -k string with a -
-  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"

 - label: Pytorch Nightly Dependency Override Check # 2min
  # if this test fails, it means the nightly torch version is not compatible with some