[CI][torch.compile] Reduce e2e fusion test time (#33293)

Signed-off-by: Luka Govedič <lgovedic@redhat.com>
Signed-off-by: ProExpertProg <luka.govedic@gmail.com>
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2026-02-04 19:09:03 -05:00
parent 439afa4eea
commit 4d9513537d
17 changed files with 1068 additions and 821 deletions
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -604,9 +604,11 @@ steps:
  - tests/compile
  commands:
  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
-    # Limit to no custom ops to reduce running time
-    # Wrap with quotes to escape yaml and avoid starting -k string with a -
-  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+    # # Limit to no custom ops to reduce running time
+    # # Wrap with quotes to escape yaml and avoid starting -k string with a -
+    # - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+    # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
+    # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.

 - label: Cudagraph test
  timeout_in_minutes: 20
@@ -1181,7 +1183,6 @@ steps:
  - tests/compile/test_fusion_attn.py
  - tests/compile/test_silu_mul_quant_fusion.py
  - tests/compile/distributed/test_fusion_all_reduce.py
-  - tests/compile/distributed/test_fusions_e2e.py
  - tests/compile/fullgraph/test_full_graph.py
  commands:
    - nvidia-smi
@@ -1189,33 +1190,16 @@ steps:
    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
    # this runner has 2 GPUs available even though num_gpus=2 is not set
    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-    # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
-    # Wrap with quotes to escape yaml
-    - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+
+    # # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+    # # Wrap with quotes to escape yaml
+    # - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+    # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
+    # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
+
    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
    - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

-- label: Blackwell Fusion E2E Tests # 30 min
-  timeout_in_minutes: 40
-  working_dir: "/vllm-workspace/"
-  gpu: b200
-  optional: true
-  num_gpus: 2
-  source_file_dependencies:
-  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/
-  # can affect pattern matching
-  - vllm/model_executor/layers/layernorm.py
-  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/distributed/test_fusions_e2e.py
-  commands:
-    - nvidia-smi
-    # Run all e2e fusion tests
-    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
-
 - label: Blackwell GPT-OSS Eval
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
@@ -1566,7 +1550,10 @@ steps:
    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
    #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
-    - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
+    # - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
+    # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
+    # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
+
    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
    - pytest -v -s tests/distributed/test_context_parallel.py
    - HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization