[CI][torch.compile] Reduce e2e fusion test time (#33293)

Signed-off-by: Luka Govedič <lgovedic@redhat.com> Signed-off-by: ProExpertProg <luka.govedic@gmail.com> Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2026-02-04 19:09:03 -05:00
parent 439afa4eea
commit 4d9513537d
17 changed files with 1068 additions and 821 deletions
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -537,9 +537,11 @@ steps:
  commands:
    # fp8 kv scales not supported on sm89, tested on Blackwell instead
  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
-    # Limit to no custom ops to reduce running time
-    # Wrap with quotes to escape yaml and avoid starting -k string with a -
-  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+    # # Limit to no custom ops to reduce running time
+    # # Wrap with quotes to escape yaml and avoid starting -k string with a -
+    # - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+    # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
+    # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.

 - label: Cudagraph test
  timeout_in_minutes: 20
@@ -1069,7 +1071,6 @@ steps:
  - tests/compile/test_fusion_attn.py
  - tests/compile/test_silu_mul_quant_fusion.py
  - tests/compile/distributed/test_fusion_all_reduce.py
-  - tests/compile/distributed/test_fusions_e2e.py
  - tests/compile/fullgraph/test_full_graph.py
  commands:
    - nvidia-smi
@@ -1077,75 +1078,15 @@ steps:
    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
    # this runner has 2 GPUs available even though num_gpus=2 is not set
    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-    # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
-    # Wrap with quotes to escape yaml
-    - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+    #  # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+    #  # Wrap with quotes to escape yaml
+    #  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+    # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
+    # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
+
    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
    - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

- label: Blackwell Fusion E2E Tests # 30 min
-  timeout_in_minutes: 40
-  working_dir: "/vllm-workspace/"
-  gpu: b200
-  optional: true
-  num_gpus: 2
-  source_file_dependencies:
-  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/
-  # can affect pattern matching
-  - vllm/model_executor/layers/layernorm.py
-  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/distributed/test_fusions_e2e.py
-  commands:
-    - nvidia-smi
-    # Run all e2e fusion tests
-    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
-
- label: Hopper Fusion E2E Tests (H100) # 10min
-  timeout_in_minutes: 70
-  working_dir: "/vllm-workspace/"
-  gpu: h100
-  optional: true
-  source_file_dependencies:
-  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/
-  # can affect pattern matching
-  - vllm/model_executor/layers/layernorm.py
-  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/test_fusion_attn.py
-  commands:
-    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-    # skip Llama-4 since it does not fit on this device
-    - pytest -v -s tests/compile/test_fusion_attn.py -k 'not Llama-4'
-
- label: Hopper Fusion Distributed E2E Tests (2xH100)  # 70min
-  timeout_in_minutes: 70
-  working_dir: "/vllm-workspace/"
-  gpu: h100
-  optional: true
-  num_gpus: 2
-  source_file_dependencies:
-  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/
-  # can affect pattern matching
-  - vllm/model_executor/layers/layernorm.py
-  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/distributed/test_fusions_e2e.py
-  commands:
-    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-    # Run all e2e fusion tests
-    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
-    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-
 - label: Blackwell GPT-OSS Eval
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"