[CI][torch.compile] Reduce e2e fusion test time (#33293)

Signed-off-by: Luka Govedič <lgovedic@redhat.com> Signed-off-by: ProExpertProg <luka.govedic@gmail.com> Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2026-02-04 19:09:03 -05:00
parent 439afa4eea
commit 4d9513537d
17 changed files with 1068 additions and 821 deletions
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -604,9 +604,11 @@ steps:
  - tests/compile
  commands:
  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
-    # Limit to no custom ops to reduce running time
-    # Wrap with quotes to escape yaml and avoid starting -k string with a -
-  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+    # # Limit to no custom ops to reduce running time
+    # # Wrap with quotes to escape yaml and avoid starting -k string with a -
+    # - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+    # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
+    # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.

 - label: Cudagraph test
  timeout_in_minutes: 20
@@ -1181,7 +1183,6 @@ steps:
  - tests/compile/test_fusion_attn.py
  - tests/compile/test_silu_mul_quant_fusion.py
  - tests/compile/distributed/test_fusion_all_reduce.py
-  - tests/compile/distributed/test_fusions_e2e.py
  - tests/compile/fullgraph/test_full_graph.py
  commands:
    - nvidia-smi
@@ -1189,33 +1190,16 @@ steps:
    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
    # this runner has 2 GPUs available even though num_gpus=2 is not set
    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-    # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
-    # Wrap with quotes to escape yaml
-    - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+
+    # # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+    # # Wrap with quotes to escape yaml
+    # - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+    # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
+    # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
+
    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
    - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

- label: Blackwell Fusion E2E Tests # 30 min
-  timeout_in_minutes: 40
-  working_dir: "/vllm-workspace/"
-  gpu: b200
-  optional: true
-  num_gpus: 2
-  source_file_dependencies:
-  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/
-  # can affect pattern matching
-  - vllm/model_executor/layers/layernorm.py
-  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/distributed/test_fusions_e2e.py
-  commands:
-    - nvidia-smi
-    # Run all e2e fusion tests
-    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
-
 - label: Blackwell GPT-OSS Eval
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
@@ -1566,7 +1550,10 @@ steps:
    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
    #- pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
-    - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
+    # - "VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'"
+    # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
+    # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
+
    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
    - pytest -v -s tests/distributed/test_context_parallel.py
    - HIP_VISIBLE_DEVICES=0,1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=allgather_reducescatter --disable-nccl-for-dp-synchronization
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -537,9 +537,11 @@ steps:
  commands:
    # fp8 kv scales not supported on sm89, tested on Blackwell instead
  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
-    # Limit to no custom ops to reduce running time
-    # Wrap with quotes to escape yaml and avoid starting -k string with a -
-  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+    # # Limit to no custom ops to reduce running time
+    # # Wrap with quotes to escape yaml and avoid starting -k string with a -
+    # - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
+    # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
+    # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.

 - label: Cudagraph test
  timeout_in_minutes: 20
@@ -1069,7 +1071,6 @@ steps:
  - tests/compile/test_fusion_attn.py
  - tests/compile/test_silu_mul_quant_fusion.py
  - tests/compile/distributed/test_fusion_all_reduce.py
-  - tests/compile/distributed/test_fusions_e2e.py
  - tests/compile/fullgraph/test_full_graph.py
  commands:
    - nvidia-smi
@@ -1077,75 +1078,15 @@ steps:
    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
    # this runner has 2 GPUs available even though num_gpus=2 is not set
    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-    # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
-    # Wrap with quotes to escape yaml
-    - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+    #  # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+    #  # Wrap with quotes to escape yaml
+    #  - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
+    # Old E2E tests were removed in https://github.com/vllm-project/vllm/pull/33293
+    # in favor of new tests in fusions_e2e. We avoid replicating the new jobs in this file as it's deprecated.
+
    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
    - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

- label: Blackwell Fusion E2E Tests # 30 min
-  timeout_in_minutes: 40
-  working_dir: "/vllm-workspace/"
-  gpu: b200
-  optional: true
-  num_gpus: 2
-  source_file_dependencies:
-  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/
-  # can affect pattern matching
-  - vllm/model_executor/layers/layernorm.py
-  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/distributed/test_fusions_e2e.py
-  commands:
-    - nvidia-smi
-    # Run all e2e fusion tests
-    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
-
- label: Hopper Fusion E2E Tests (H100) # 10min
-  timeout_in_minutes: 70
-  working_dir: "/vllm-workspace/"
-  gpu: h100
-  optional: true
-  source_file_dependencies:
-  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/
-  # can affect pattern matching
-  - vllm/model_executor/layers/layernorm.py
-  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/test_fusion_attn.py
-  commands:
-    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-    # skip Llama-4 since it does not fit on this device
-    - pytest -v -s tests/compile/test_fusion_attn.py -k 'not Llama-4'
-
- label: Hopper Fusion Distributed E2E Tests (2xH100)  # 70min
-  timeout_in_minutes: 70
-  working_dir: "/vllm-workspace/"
-  gpu: h100
-  optional: true
-  num_gpus: 2
-  source_file_dependencies:
-  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/
-  # can affect pattern matching
-  - vllm/model_executor/layers/layernorm.py
-  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/distributed/test_fusions_e2e.py
-  commands:
-    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-    # Run all e2e fusion tests
-    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
-    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-
 - label: Blackwell GPT-OSS Eval
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
--- a/.buildkite/test_areas/compile.yaml
+++ b/.buildkite/test_areas/compile.yaml
@@ -2,56 +2,196 @@ group: Compile
 depends_on: 
  - image-build
 steps:
- label: Fusion and Compile Tests (B200)
+- label: Sequence Parallel Tests (2 GPUs)
+  timeout_in_minutes: 50
+  working_dir: "/vllm-workspace/"
+  num_devices: 2
+  source_file_dependencies:
+  - vllm/model_executor/layers/
+  - vllm/compilation/
+  - vllm/v1/worker/
+  - vllm/v1/cudagraph_dispatcher.py
+  - tests/distributed/test_sequence_parallel.py
+  commands:
+  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+  - pytest -v -s tests/distributed/test_sequence_parallel.py
+
+- label: Sequence Parallel Tests (2xH100)
+  timeout_in_minutes: 50
+  working_dir: "/vllm-workspace/"
+  device: h100
+  optional: true
+  num_devices: 2
+  commands:
+  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+  - pytest -v -s tests/distributed/test_sequence_parallel.py
+
+- label: Distributed Compile Unit Tests (2xH100)
  timeout_in_minutes: 40
  working_dir: "/vllm-workspace/"
+  device: h100
+  num_devices: 2
+  source_file_dependencies:
+  - vllm/compilation/
+  - vllm/model_executor/layers
+  - tests/compile/distributed/test_fusion_all_reduce.py
+  - tests/compile/distributed/test_sequence_parallelism.py
+  - tests/compile/distributed/test_async_tp.py
+  commands:
+  - export VLLM_TEST_CLEAN_GPU_MEMORY=1
+  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
+  - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
+  - pytest -v -s tests/compile/distributed/test_async_tp.py
+
+- label: Fusion and Compile Unit Tests (B200)
+  timeout_in_minutes: 20
+  working_dir: "/vllm-workspace/"
  device: b200
  source_file_dependencies:
  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/v1/worker/
-  - vllm/v1/cudagraph_dispatcher.py
-  - vllm/compilation/
-  # can affect pattern matching
+  - vllm/model_executor/layers/quantization/
  - vllm/model_executor/layers/layernorm.py
  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  - vllm/model_executor/layers/attention/attention.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/ # TODO(luka) limit to vllm/compilation/passes
  - tests/compile/test_fusion_attn.py
  - tests/compile/test_silu_mul_quant_fusion.py
  - tests/compile/distributed/test_fusion_all_reduce.py
-  - tests/compile/distributed/test_fusions_e2e.py
  - tests/compile/fullgraph/test_full_graph.py
  commands:
+    # b200 runners are limited, so we limit the tests to the minimum set only supported on Blackwell
    - nvidia-smi
-    - pytest -v -s tests/compile/test_fusion_attn.py
+    - pytest -v -s tests/compile/test_fusion_attn.py -k FLASHINFER
    - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
    # this runner has 2 GPUs available even though num_devices=2 is not set
    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
-    # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
-    # Wrap with quotes to escape yaml
-    - "pytest -v -s tests/compile/distributed/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
    # test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
+    # TODO(luka) move to H100 once pass tests run on H100
    - pytest -v -s tests/compile/fullgraph/test_full_graph.py::test_fp8_kv_scale_compile

- label: Fusion E2E (2 GPUs)(B200)
-  timeout_in_minutes: 40
+- label: Fusion E2E Quick (H100)
+  timeout_in_minutes: 15
  working_dir: "/vllm-workspace/"
-  device: b200
-  optional: true
-  num_devices: 2
+  device: h100
+  num_devices: 1
  source_file_dependencies:
-  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/
-  # can affect pattern matching
-  - vllm/model_executor/layers/layernorm.py
-  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/distributed/test_fusions_e2e.py
+    - csrc/quantization/
+    - vllm/model_executor/
+    - vllm/v1/attention/
+    - vllm/compilation/
+    - tests/compile/fusions_e2e/
  commands:
    - nvidia-smi
-    # Run all e2e fusion tests
-    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py
+    # Run all models and attn backends but only Inductor partition and native custom ops
+    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
+    # Qwen requires +quant_fp8 as -quant_fp8 rms+quant fusion is not supported
+    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "inductor_partition and not +rms_norm and +quant_fp8 and qwen3"

+- label: Fusion E2E Config Sweep (H100)
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/"
+  device: h100
+  num_devices: 1
+  source_file_dependencies:
+    - csrc/quantization/
+    - vllm/compilation/
+    # can affect pattern matching
+    - vllm/model_executor/layers/layernorm.py
+    - vllm/model_executor/layers/activation.py
+    - vllm/model_executor/layers/attention/attention.py
+    - vllm/model_executor/layers/quantization/input_quant_fp8.py
+    - tests/compile/fusions_e2e/
+  commands:
+    - nvidia-smi
+    # Run just llama3 (fp8) for all config combinations
+    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "llama-3"
+
+- label: Fusion E2E Config Sweep (B200)
+  timeout_in_minutes: 30
+  working_dir: "/vllm-workspace/"
+  device: b200
+  num_devices: 1
+  optional: true
+  commands:
+    - nvidia-smi
+    # pytest honours only the LAST -k option, so the three selections are OR-ed in one expression:
+    # 1. all models and attn backends, but only Inductor partition and native custom ops
+    # 2. Qwen with +quant_fp8, as -quant_fp8 rms+quant fusion is not supported
+    # 3. llama3 (fp8 & fp4) for all config combinations
+    # (repeating -k would silently run only the "llama-3" selection)
+    # Parentheses keep each and-group intact inside the or-expression.
+    - pytest -v -s tests/compile/fusions_e2e/test_tp1_quant.py -k "(inductor_partition and not +rms_norm and not +quant_fp8) or (inductor_partition and not +rms_norm and +quant_fp8 and qwen3) or llama-3"
+
+- label: Fusion E2E TP2 Quick (H100)
+  timeout_in_minutes: 20
+  working_dir: "/vllm-workspace/"
+  device: h100
+  num_devices: 2
+  source_file_dependencies:
+    - csrc/quantization/
+    - vllm/model_executor/
+    - vllm/v1/attention/
+    - vllm/compilation/
+    - tests/compile/fusions_e2e/
+  commands:
+    - nvidia-smi
+    # Run all models and attn backends but only Inductor partition and native custom ops
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
+
+- label: Fusion E2E TP2 AR-RMS Config Sweep (H100)
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  device: h100
+  num_devices: 2
+  source_file_dependencies:
+    - csrc/quantization/
+    - vllm/compilation/
+    # can affect pattern matching
+    - vllm/model_executor/layers/layernorm.py
+    - vllm/model_executor/layers/activation.py
+    - vllm/model_executor/layers/attention/attention.py
+    - vllm/model_executor/layers/quantization/input_quant_fp8.py
+    - tests/compile/fusions_e2e/
+  commands:
+    - nvidia-smi
+    # Run just llama3 (fp4 & fp8 & bf16) for all config combinations
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "llama-3"
+
+- label: Fusion E2E TP2 AsyncTP Config Sweep (H100)
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  device: h100
+  num_devices: 2
+  source_file_dependencies:
+    - csrc/quantization/
+    - vllm/compilation/
+    # can affect pattern matching
+    - vllm/model_executor/layers/layernorm.py
+    - vllm/model_executor/layers/activation.py
+    - vllm/model_executor/layers/attention/attention.py
+    - vllm/model_executor/layers/quantization/input_quant_fp8.py
+    - tests/compile/fusions_e2e/
+  commands:
+    - nvidia-smi
+    # Run just llama3 (fp8 & bf16) for all config combinations
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "llama-3"
+
+- label: Fusion E2E TP2 (B200)
+  timeout_in_minutes: 20
+  working_dir: "/vllm-workspace/"
+  device: b200
+  num_devices: 2
+  source_file_dependencies:
+    - csrc/quantization/
+    - vllm/model_executor/
+    - vllm/v1/attention/
+    - vllm/compilation/
+    - tests/compile/fusions_e2e/
+  commands:
+    - nvidia-smi
+    # Run all models and attn backends but only Inductor partition and native custom ops;
+    # for ar-rms-quant-fp4, also sweep llama3 (single or-ed -k: pytest honours only the last -k given)
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_ar_rms.py -k "(inductor_partition and not +rms_norm and not +quant_fp8) or Llama-3.1-8B-Instruct-FP4"
+    - pytest -v -s tests/compile/fusions_e2e/test_tp2_async_tp.py -k "inductor_partition and not +rms_norm and not +quant_fp8"
--- a/.buildkite/test_areas/distributed.yaml
+++ b/.buildkite/test_areas/distributed.yaml
@@ -16,7 +16,7 @@ steps:
  - pytest -v -s distributed/test_shm_storage.py

 - label: Distributed (2 GPUs)
-  timeout_in_minutes: 90
+  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/tests"
  num_devices: 2
  source_file_dependencies:
@@ -47,7 +47,6 @@ steps:
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
-  - pytest -v -s distributed/test_sequence_parallel.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
  - pytest -v -s v1/worker/test_worker_memory_snapshot.py

@@ -133,25 +132,13 @@ steps:
  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - pytest -v -s -x lora/test_mixtral.py

- label: Sequence Parallel Tests (H100)
-  timeout_in_minutes: 60
-  working_dir: "/vllm-workspace/"
-  device: h100
-  optional: true
-  num_devices: 2
-  commands:
-    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-    # Run sequence parallel tests
-    - pytest -v -s tests/distributed/test_sequence_parallel.py
-    - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
-
 - label: Distributed Tests (2 GPUs)(H100)
+  timeout_in_minutes: 15
  device: h100
  optional: true
  working_dir: "/vllm-workspace/"
  num_devices: 2
  commands:
-    - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
    - pytest -v -s tests/distributed/test_context_parallel.py
    - VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput
    - pytest -v -s tests/v1/distributed/test_dbo.py
@@ -217,45 +204,3 @@ steps:
  commands:
  - pytest -v -s distributed/test_pp_cudagraph.py
  - pytest -v -s distributed/test_pipeline_parallel.py
-
- label: Hopper Fusion E2E Tests (H100)
-  timeout_in_minutes: 70
-  working_dir: "/vllm-workspace/"
-  device: h100
-  optional: true
-  source_file_dependencies:
-  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/
-  # can affect pattern matching
-  - vllm/model_executor/layers/layernorm.py
-  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/test_fusion_attn.py
-  commands:
-    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-    # skip Llama-4 since it does not fit on this device
-    - pytest -v -s tests/compile/test_fusion_attn.py -k 'not Llama-4'
-
- label: Hopper Fusion Distributed E2E Tests (2xH100)
-  timeout_in_minutes: 70
-  working_dir: "/vllm-workspace/"
-  device: h100
-  optional: true
-  num_devices: 2
-  source_file_dependencies:
-  - csrc/quantization/fp4/
-  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
-  - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/
-  # can affect pattern matching
-  - vllm/model_executor/layers/layernorm.py
-  - vllm/model_executor/layers/activation.py
-  - vllm/model_executor/layers/quantization/input_quant_fp8.py
-  - tests/compile/distributed/test_fusions_e2e.py
-  commands:
-    - export VLLM_TEST_CLEAN_GPU_MEMORY=1
-    # Run all e2e fusion tests
-    - pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
-    - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
--- a/.buildkite/test_areas/pytorch.yaml
+++ b/.buildkite/test_areas/pytorch.yaml
@@ -18,7 +18,7 @@ steps:
  - "find compile/ -maxdepth 1 -name 'test_*.py' -print0 | xargs -0 -n1 -I{} pytest -s -v '{}'"

 - label: PyTorch Fullgraph Smoke Test
-  timeout_in_minutes: 30
+  timeout_in_minutes: 35
  source_file_dependencies:
  - vllm/
  - tests/compile
@@ -30,16 +30,13 @@ steps:
  - "find compile/fullgraph/ -name 'test_*.py' -not -name 'test_full_graph.py' -exec pytest -s -v {} \\;"

 - label: PyTorch Fullgraph
-  timeout_in_minutes: 40
+  timeout_in_minutes: 30
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
    # fp8 kv scales not supported on sm89, tested on Blackwell instead
  - pytest -v -s compile/fullgraph/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
-    # Limit to no custom ops to reduce running time
-    # Wrap with quotes to escape yaml and avoid starting -k string with a -
-  - "pytest -v -s compile/distributed/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"

 - label: Pytorch Nightly Dependency Override Check # 2min
  # if this test fails, it means the nightly torch version is not compatible with some
--- a/tests/compile/distributed/test_fusions_e2e.py
+++ b/tests/compile/distributed/test_fusions_e2e.py
@@ -1,321 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from __future__ import annotations
-
-import logging
-from typing import Any
-
-import pytest
-import regex as re
-
-from tests.compile.fusion_test_utils import (
-    CUSTOM_OPS_FP8,
-    CUSTOM_OPS_QUANT_RMS_NORM,
-    CUSTOM_OPS_RMS_NORM,
-    MODELS,
-    MODELS_FP4,
-    MODELS_FP8,
-    MODELS_GROUP_FP8,
-    Matches,
-    custom_ops_product,
-    is_blackwell,
-    run_model,
-)
-from tests.v1.attention.utils import AttentionBackendEnum
-from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
-from vllm.platforms import current_platform
-from vllm.utils.flashinfer import has_flashinfer
-from vllm.utils.torch_utils import is_torch_equal_or_newer
-
-from ...utils import flat_product, multi_gpu_test
-
-
-@multi_gpu_test(num_gpus=2)
-@pytest.mark.parametrize(
-    "model_name, model_kwargs, backend, matches, custom_ops",
-    # Toggle RMSNorm and QuantFP8 for FP8 models
-    list(
-        flat_product(
-            MODELS_FP8, custom_ops_product(CUSTOM_OPS_FP8, CUSTOM_OPS_RMS_NORM)
-        )
-    )
-    # Toggle RMSNorm for FP4 models and unquant models
-    + list(flat_product(MODELS_FP4 + MODELS, CUSTOM_OPS_RMS_NORM)),
-)
-@pytest.mark.parametrize("inductor_graph_partition", [True, False])
-@pytest.mark.skipif(
-    not current_platform.is_cuda()
-    or not has_flashinfer()
-    or not current_platform.has_device_capability(90),
-    reason="allreduce+rmsnorm fusion requires flashinfer",
-)
-def test_tp2_attn_quant_allreduce_rmsnorm(
-    model_name: str,
-    model_kwargs: dict,
-    backend: AttentionBackendEnum,
-    matches: Matches,
-    custom_ops: str,
-    inductor_graph_partition: bool,
-    caplog_mp_spawn,
-    monkeypatch,
-):
-    if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
-        pytest.skip("Inductor graph partition requires torch>=2.9")
-
-    if "fp4" in model_name.lower() and not is_blackwell():
-        pytest.skip("NVFP4 quant requires Blackwell")
-
-    if backend == AttentionBackendEnum.FLASHINFER and not is_blackwell():
-        # FlashInfer attn fusion requires Blackwell
-        matches = matches._replace(attention_fusion=0)
-
-    custom_ops_list = custom_ops.split(",") if custom_ops else []
-
-    if inductor_graph_partition:
-        mode = CUDAGraphMode.FULL_AND_PIECEWISE
-        splitting_ops: list[str] | None = None
-    else:
-        mode = CUDAGraphMode.FULL_DECODE_ONLY
-        splitting_ops = []
-
-    # Disable, compile cache to make sure custom passes run.
-    # Otherwise, we can't verify fusion happened through the logs.
-    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
-
-    # To capture subprocess logs, we need to know whether spawn or fork is used.
-    # Force spawn as it is more general.
-    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
-
-    model_kwargs["attention_config"] = {"backend": backend.name}
-
-    compilation_config = CompilationConfig(
-        # Testing properties
-        use_inductor_graph_partition=inductor_graph_partition,
-        cudagraph_mode=mode,
-        custom_ops=custom_ops_list,
-        splitting_ops=splitting_ops,
-        # Common
-        mode=CompilationMode.VLLM_COMPILE,
-        pass_config=PassConfig(
-            fuse_attn_quant=True,
-            eliminate_noops=True,
-            fuse_allreduce_rms=True,
-        ),
-        # Inductor caches custom passes by default as well via uuid
-        inductor_compile_config={"force_disable_caches": True},
-    )
-
-    with caplog_mp_spawn(logging.DEBUG) as log_holder:
-        run_model(
-            compilation_config, model_name, tensor_parallel_size=2, **model_kwargs
-        )
-    log_matches = re.findall(
-        r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
-        log_holder.text,
-    )
-    # 2 for each compile range
-    # (global compile range can be split due to fuse_allreduce_rmsnorm)
-    num_compile_ranges = len(compilation_config.get_compile_ranges())
-    assert num_compile_ranges in [1, 2]
-
-    assert len(log_matches) == 2 * num_compile_ranges, log_holder.text
-
-    assert all(int(log_match) == matches.attention_fusion for log_match in log_matches)
-
-    log_matches = re.findall(
-        r"collective_fusion.py:\d+] Replaced (\d+) patterns",
-        log_holder.text,
-    )
-    assert len(log_matches) == 2, log_holder.text
-
-    assert int(log_matches[0]) == matches.allreduce_fusion
-    assert int(log_matches[1]) == matches.allreduce_fusion
-
-    log_matches = re.findall(
-        r"pass_manager.py:\d+] Skipping .*AllReduceFusionPass.* with compile range",
-        log_holder.text,
-    )
-    assert len(log_matches) == 2 * (num_compile_ranges - 1), log_holder.text
-
-
-@multi_gpu_test(num_gpus=2)
-@pytest.mark.parametrize(
-    "model_name, model_kwargs, backend, matches, custom_ops",
-    # Toggle RMSNorm and QuantFP8 for FP8 models
-    list(
-        flat_product(
-            MODELS_FP8, custom_ops_product(CUSTOM_OPS_FP8, CUSTOM_OPS_RMS_NORM)
-        )
-    )
-    # Toggle RMSNorm for FP4 models and unquant models
-    + list(flat_product(MODELS_FP4 + MODELS, CUSTOM_OPS_RMS_NORM)),
-)
-@pytest.mark.parametrize("inductor_graph_partition", [True, False])
-@pytest.mark.skipif(
-    not current_platform.is_cuda(),
-    reason="sequence parallel only tested on CUDA",
-)
-def test_tp2_attn_quant_async_tp(
-    model_name: str,
-    model_kwargs: dict,
-    backend: AttentionBackendEnum,
-    matches: Matches,
-    custom_ops: str,
-    inductor_graph_partition: bool,
-    caplog_mp_spawn,
-    monkeypatch,
-):
-    if is_blackwell():
-        # TODO: https://github.com/vllm-project/vllm/issues/27893
-        pytest.skip("Blackwell is not supported for AsyncTP pass")
-
-    if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
-        pytest.skip("Inductor graph partition requires torch>=2.9")
-
-    if "fp4" in model_name.lower() and not is_blackwell():
-        pytest.skip("NVFP4 quant requires Blackwell")
-
-    if backend == AttentionBackendEnum.FLASHINFER:
-        if not has_flashinfer():
-            pytest.skip("FlashInfer backend requires flashinfer installed")
-        if not is_blackwell():
-            # FlashInfer attn fusion requires Blackwell
-            matches = matches._replace(attention_fusion=0)
-
-    custom_ops_list = custom_ops.split(",") if custom_ops else []
-
-    if inductor_graph_partition:
-        mode = CUDAGraphMode.FULL_AND_PIECEWISE
-        splitting_ops: list[str] | None = None
-    else:
-        mode = CUDAGraphMode.FULL_DECODE_ONLY
-        splitting_ops = []
-
-    # Disable, compile cache to make sure custom passes run.
-    # Otherwise, we can't verify fusion happened through the logs.
-    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
-
-    # To capture subprocess logs, we need to know whether spawn or fork is used.
-    # Force spawn as it is more general.
-    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
-
-    model_kwargs["attention_config"] = {"backend": backend.name}
-
-    compilation_config = CompilationConfig(
-        # Testing properties
-        use_inductor_graph_partition=inductor_graph_partition,
-        cudagraph_mode=mode,
-        custom_ops=custom_ops_list,
-        splitting_ops=splitting_ops,
-        # Common
-        mode=CompilationMode.VLLM_COMPILE,
-        pass_config=PassConfig(
-            fuse_attn_quant=True,
-            eliminate_noops=True,
-            enable_sp=True,
-            fuse_gemm_comms=True,
-        ),
-        # Inductor caches custom passes by default as well via uuid
-        inductor_compile_config={"force_disable_caches": True},
-    )
-
-    with caplog_mp_spawn(logging.DEBUG) as log_holder:
-        run_model(
-            compilation_config, model_name, tensor_parallel_size=2, **model_kwargs
-        )
-    log_matches = re.findall(
-        r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
-        log_holder.text,
-    )
-    assert len(log_matches) == 2, log_holder.text
-
-    assert int(log_matches[0]) == matches.attention_fusion
-    assert int(log_matches[1]) == matches.attention_fusion
-
-    log_matches = re.findall(
-        r"sequence_parallelism.py:\d+] Replaced (\d+) patterns",
-        log_holder.text,
-    )
-    assert len(log_matches) == 2, log_holder.text
-
-    assert int(log_matches[0]) == matches.sequence_parallel
-    assert int(log_matches[1]) == matches.sequence_parallel
-
-    log_matches = re.findall(
-        r"collective_fusion.py:\d+] Replaced (\d+) patterns",
-        log_holder.text,
-    )
-    assert len(log_matches) == 2, log_holder.text
-
-    assert int(log_matches[0]) == matches.async_tp
-    assert int(log_matches[1]) == matches.async_tp
-
-
-@pytest.mark.parametrize(
-    "model_name, model_kwargs, backend, matches, custom_ops",
-    # Test rms norm+group quant_fp8 fusion
-    list[tuple[Any, ...]](flat_product(MODELS_GROUP_FP8, CUSTOM_OPS_QUANT_RMS_NORM)),
-)
-@pytest.mark.parametrize("inductor_graph_partition", [True, False])
-# TODO: remove skip after we fix the fusion thoroughly
-@pytest.mark.skipif(is_blackwell(), reason="Temporarily disabled on Blackwell")
-def test_rms_group_quant(
-    model_name: str,
-    model_kwargs: dict[str, Any],
-    backend: AttentionBackendEnum,
-    matches: Matches,
-    custom_ops: str,
-    inductor_graph_partition: bool,
-    caplog_mp_spawn,
-    monkeypatch,
-):
-    if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
-        pytest.skip("Inductor graph partition requires torch>=2.9")
-
-    custom_ops_list = custom_ops.split(",") if custom_ops else []
-
-    if inductor_graph_partition:
-        mode = CUDAGraphMode.FULL_AND_PIECEWISE
-        splitting_ops: list[str] | None = None
-    else:
-        mode = CUDAGraphMode.FULL_DECODE_ONLY
-        splitting_ops = []
-
-    # Disable, compile cache to make sure custom passes run.
-    # Otherwise, we can't verify fusion happened through the logs.
-    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
-
-    # To capture subprocess logs, we need to know whether spawn or fork is used.
-    # Force spawn as it is more general.
-    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
-
-    # TODO: remove this after fusion is fixed
-    monkeypatch.setenv("VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES", "0")
-
-    model_kwargs["attention_config"] = {"backend": backend.name}
-
-    compilation_config = CompilationConfig(
-        # Testing properties
-        custom_ops=custom_ops_list,
-        use_inductor_graph_partition=inductor_graph_partition,
-        cudagraph_mode=mode,
-        splitting_ops=splitting_ops,
-        # Common
-        mode=CompilationMode.VLLM_COMPILE,
-        pass_config=PassConfig(
-            fuse_norm_quant=True, fuse_act_quant=True, eliminate_noops=True
-        ),
-        # Inductor caches custom passes by default as well via uuid
-        inductor_compile_config={"force_disable_caches": True},
-    )
-
-    with caplog_mp_spawn(logging.DEBUG) as log_holder:
-        run_model(compilation_config, model_name, **model_kwargs)
-
-    log_matches = re.findall(
-        r"\[fusion.py:\d+] Replaced (\d+) patterns",
-        log_holder.text,
-    )
-    assert len(log_matches) == 1, log_holder.text
-    assert int(log_matches[0]) == matches.rms_quant_norm_fusion
--- a/tests/compile/fusion_test_utils.py
+++ b/tests/compile/fusion_test_utils.py
@@ -1,208 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Shared utilities for fusion tests (e.g. test_fusion_attn.py)."""
-
-from __future__ import annotations
-
-import itertools
-from collections.abc import Iterable
-from typing import Any, NamedTuple
-
-from tests.v1.attention.utils import AttentionBackendEnum
-from vllm import LLM, SamplingParams
-from vllm.config import CompilationConfig, CUDAGraphMode
-from vllm.platforms import current_platform
-
-is_blackwell = lambda: current_platform.is_device_capability_family(100)
-"""Are we running on Blackwell, a lot of tests depend on it"""
-
-
-def has_cuda_graph_wrapper_metadata() -> bool:
-    from importlib import import_module
-
-    try:
-        module = import_module("torch._inductor.utils")
-        module.CUDAGraphWrapperMetadata  # noqa B018
-    except AttributeError:
-        return False
-    return True
-
-
-class Matches(NamedTuple):
-    attention_fusion: int = 0
-    allreduce_fusion: int = 0
-    sequence_parallel: int = 0
-    async_tp: int = 0
-    rms_quant_norm_fusion: int = 0
-
-
-class ModelBackendTestCase(NamedTuple):
-    model_name: str
-    model_kwargs: dict[str, Any]
-    backend: AttentionBackendEnum
-    matches: Matches
-
-
-# E2E model test cases
-MODELS_FP8: list[ModelBackendTestCase] = []
-MODELS_FP4: list[ModelBackendTestCase] = []
-MODELS: list[ModelBackendTestCase] = []  # tp-only (unquantized)
-MODELS_GROUP_FP8: list[ModelBackendTestCase] = []
-
-if current_platform.is_cuda():
-    MODELS_FP8 = [
-        ModelBackendTestCase(
-            # Use smaller model for L40s in CI
-            model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
-            model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
-            backend=AttentionBackendEnum.TRITON_ATTN,
-            matches=Matches(
-                attention_fusion=32,
-                allreduce_fusion=65,
-                sequence_parallel=65,
-                async_tp=128,
-            ),
-        ),
-        ModelBackendTestCase(
-            model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
-            model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
-            # TODO FlashInfer attn broken on Hopper with kvcache=fp8:
-            # https://github.com/vllm-project/vllm/issues/28568
-            backend=AttentionBackendEnum.FLASHINFER
-            if is_blackwell()
-            else AttentionBackendEnum.TRITON_ATTN,
-            matches=Matches(
-                attention_fusion=48,
-                allreduce_fusion=96,
-                sequence_parallel=96,
-                async_tp=95,  # mlp is moe, no fusion there
-            ),
-        ),
-    ]
-
-    MODELS_FP4 = [
-        ModelBackendTestCase(
-            model_name="nvidia/Llama-3.1-8B-Instruct-FP4",
-            model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
-            backend=AttentionBackendEnum.FLASHINFER,
-            matches=Matches(
-                attention_fusion=32,
-                allreduce_fusion=65,
-                sequence_parallel=65,
-                async_tp=128,
-            ),
-        ),
-    ]
-
-    # TP only (unquantized models)
-    MODELS = [
-        ModelBackendTestCase(
-            model_name="meta-llama/Llama-3.1-8B-Instruct",
-            model_kwargs=dict(max_model_len=1024),
-            backend=AttentionBackendEnum.TRITON_ATTN,
-            matches=Matches(
-                attention_fusion=0,
-                allreduce_fusion=65,
-                sequence_parallel=65,
-                async_tp=128,
-            ),
-        ),
-        ModelBackendTestCase(
-            model_name="Qwen/Qwen3-30B-A3B",
-            model_kwargs=dict(max_model_len=1024),
-            backend=AttentionBackendEnum.TRITON_ATTN,
-            matches=Matches(
-                attention_fusion=0,
-                allreduce_fusion=97,
-                sequence_parallel=97,
-                async_tp=96,  # MLP is MoE, half the fusions of dense
-            ),
-        ),
-    ]
-
-    MODELS_GROUP_FP8 = [
-        ModelBackendTestCase(
-            model_name="Qwen/Qwen3-30B-A3B-FP8",
-            model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
-            backend=AttentionBackendEnum.TRITON_ATTN,
-            matches=Matches(
-                rms_quant_norm_fusion=48,
-            ),
-        ),
-    ]
-
-elif current_platform.is_rocm():
-    MODELS_FP8 = [
-        ModelBackendTestCase(
-            model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
-            model_kwargs=dict(max_model_len=1024),
-            backend=AttentionBackendEnum.TRITON_ATTN,
-            matches=Matches(attention_fusion=32),
-        ),
-        ModelBackendTestCase(
-            model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
-            model_kwargs=dict(max_model_len=1024),
-            backend=AttentionBackendEnum.ROCM_ATTN,
-            matches=Matches(attention_fusion=32),
-        ),
-        ModelBackendTestCase(
-            model_name="amd/Llama-3.1-8B-Instruct-FP8-KV",
-            model_kwargs=dict(max_model_len=1024),
-            backend=AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN,
-            matches=Matches(attention_fusion=32),
-        ),
-    ]
-
-
-# Custom ops toggle lists for parametrization
-CUSTOM_OPS_FP8 = ["-quant_fp8", "+quant_fp8"]
-CUSTOM_OPS_RMS_NORM = ["-rms_norm", "+rms_norm"]
-CUSTOM_OPS_QUANT_RMS_NORM = ["+quant_fp8,+rms_norm"]
-
-
-def custom_ops_product(*custom_ops_lists: list[str]) -> Iterable[str]:
-    """Generate all combinations of custom ops for parametrization."""
-    for op_list in itertools.product(*custom_ops_lists):
-        yield ",".join(op_list)
-
-
-def run_model(compile_config: int | CompilationConfig, model: str, **model_kwargs):
-    """Run a model with the given compilation config for E2E fusion tests."""
-    compilation_config = (
-        compile_config
-        if isinstance(compile_config, CompilationConfig)
-        else CompilationConfig(mode=compile_config)
-    )
-
-    prompts = [
-        "Hello, my name is",
-        "The president of the United States is",
-        "The capital of France is",
-        "The future of AI is",
-    ]
-    sampling_params = SamplingParams(temperature=0)
-    # Allow override from model_kwargs
-    model_kwargs = {"tensor_parallel_size": 1, **model_kwargs}
-    model_kwargs = {"disable_custom_all_reduce": True, **model_kwargs}
-
-    # No cudagraphs by default
-    if compilation_config.cudagraph_mode is None:
-        compilation_config.cudagraph_mode = CUDAGraphMode.NONE
-    llm = LLM(
-        model=model,
-        compilation_config=compilation_config,
-        **model_kwargs,
-    )
-    outputs = llm.generate(prompts, sampling_params)
-
-    # Print the outputs.
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-    # Get the compile ranges split points after vllm config post init
-    # in order to compute compile ranges correctly
-    compilation_config.compile_ranges_split_points = (
-        llm.llm_engine.vllm_config.compilation_config.compile_ranges_split_points
-    )
--- a/tests/compile/fusions_e2e/__init__.py
+++ b/tests/compile/fusions_e2e/__init__.py
--- a/tests/compile/fusions_e2e/common.py
+++ b/tests/compile/fusions_e2e/common.py
@@ -0,0 +1,102 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import itertools
+from collections.abc import Callable, Iterable
+from typing import Any, NamedTuple
+
+import pytest
+import regex as re
+
+from vllm.platforms import current_platform
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
+
+
+class Matches(NamedTuple):
+    """Expected fusion counts for one model, one field per fusion kind.
+
+    Every count defaults to 0. The field names are also the keys of
+    ``FUSION_LOG_PATTERNS`` and the strings used in ``matches_check`` lists,
+    so the test runner can look counts up by name with ``getattr``.
+    """
+
+    # simple pointwise
+    rms_quant_fusion: int = 0
+    act_quant_fusion: int = 0
+    norm_rope_fusion: int = 0
+    attn_quant_fusion: int = 0
+    # distributed
+    ar_rms_fusion: int = 0
+    sequence_parallel: int = 0
+    async_tp: int = 0
+
+
+class ModelFusionInfo(NamedTuple):
+    """Description of one model used by the e2e fusion tests.
+
+    Field order matters: the test modules hand instances directly to
+    ``pytest.mark.parametrize`` and unpack them positionally as
+    ``model_name, matches_fn, model_kwargs, hf_overrides``.
+    """
+
+    model_name: str
+    matches: Callable[[int], Matches]
+    """Given number of hidden layers, produces the matches object"""
+    # NOTE: this default is one dict object shared by every instance that does
+    # not pass ``model_kwargs``; copy it before adding keys.
+    model_kwargs: dict[str, Any] = {}
+    # Given a layer count, returns the ``hf_overrides`` dict for the model.
+    # The default sets ``num_hidden_layers`` at the top level of the config.
+    hf_overrides: Callable[[int], dict] = lambda n: {"num_hidden_layers": n}
+
+
+class AttentionBackendCase(NamedTuple):
+    """An attention backend together with the model kwargs it needs in tests."""
+
+    backend: AttentionBackendEnum
+    # NOTE: the default is a single dict shared by all instances that omit this
+    # field; treat it as read-only.
+    model_kwargs: dict[str, Any] = {}
+    """Additional args required for attn+quant fusion"""
+
+
+def is_blackwell():
+    """Are we running on Blackwell, a lot of tests depend on it.
+
+    Returns the result of asking the current platform whether its device
+    capability family is 100.
+    """
+    return current_platform.is_device_capability_family(100)
+
+
+def custom_ops_combos(*custom_ops: str) -> Iterable[str]:
+    """Yield every disabled/enabled combination of the given custom ops.
+
+    Each op contributes ``-op`` and ``+op``; one comma-joined string is
+    yielded per element of the cartesian product, in the order produced by
+    ``itertools.product`` (the last op toggles fastest).
+    """
+    toggles = ((f"-{name}", f"+{name}") for name in custom_ops)
+    yield from map(",".join, itertools.product(*toggles))
+
+
+# Quick inline validation
+assert list(custom_ops_combos("silu_and_mul")) == ["-silu_and_mul", "+silu_and_mul"]
+assert list(custom_ops_combos("quant_fp8", "rms_norm")) == [
+    "-quant_fp8,-rms_norm",
+    "-quant_fp8,+rms_norm",
+    "+quant_fp8,-rms_norm",
+    "+quant_fp8,+rms_norm",
+]
+
+
+def has_cuda_graph_wrapper_metadata() -> bool:
+    """Report whether torch exposes ``CUDAGraphWrapperMetadata``.
+
+    Looks the name up on ``torch._inductor.utils``. If that module cannot be
+    imported at all, the answer is ``False`` as well, so callers that use the
+    result in a ``skipif`` get a clean skip instead of an error at import.
+    """
+    from importlib import import_module
+
+    try:
+        module = import_module("torch._inductor.utils")
+    except ImportError:
+        return False
+    return hasattr(module, "CUDAGraphWrapperMetadata")
+
+
+# Parametrization values for the ``inductor_graph_partition`` test argument.
+# The ``True`` case is skipped when ``has_cuda_graph_wrapper_metadata()``
+# returns False; that check runs once, when this module is imported.
+INDUCTOR_GRAPH_PARTITION = [
+    pytest.param(
+        True,
+        marks=pytest.mark.skipif(
+            not has_cuda_graph_wrapper_metadata(),
+            reason="torch version does not support Inductor partition",
+        ),
+        id="inductor_partition",
+    ),
+    pytest.param(False, id="dynamo_partition"),
+]
+
+# Maps each ``Matches`` field name to the regex that pulls its count out of the
+# captured debug logs (group 1 is the count). The dots in the file names are
+# escaped so that they only match a literal ".".
+# NOTE: "ar_rms_fusion" and "async_tp" use the same pattern (presumably both
+# passes log from collective_fusion.py), so this table alone cannot tell
+# their log lines apart.
+FUSION_LOG_PATTERNS: dict[str, re.Pattern] = {
+    "rms_quant_fusion": re.compile(
+        r"\[(?:compilation/)?fusion\.py:\d+] Replaced (\d+) patterns"
+    ),
+    "act_quant_fusion": re.compile(
+        r"activation_quant_fusion\.py:\d+] Replaced (\d+) patterns"
+    ),
+    "norm_rope_fusion": re.compile(
+        r"qk_norm_rope_fusion\.py:\d+] Fused QK Norm\+RoPE on (\d+) sites"
+    ),
+    "attn_quant_fusion": re.compile(
+        r"fusion_attn\.py:\d+] Fused quant onto (\d+) attention nodes"
+    ),
+    "ar_rms_fusion": re.compile(r"collective_fusion\.py:\d+] Replaced (\d+) patterns"),
+    "sequence_parallel": re.compile(
+        r"sequence_parallelism\.py:\d+] Replaced (\d+) patterns"
+    ),
+    "async_tp": re.compile(r"collective_fusion\.py:\d+] Replaced (\d+) patterns"),
+}
--- a/tests/compile/fusions_e2e/conftest.py
+++ b/tests/compile/fusions_e2e/conftest.py
@@ -0,0 +1,158 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import logging
+
+import pytest
+import regex as re
+
+from vllm import LLM, SamplingParams
+from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode
+
+from .common import FUSION_LOG_PATTERNS, AttentionBackendCase, Matches
+
+
+def run_model(compile_config: int | CompilationConfig, model: str, **model_kwargs):
+    """Build an ``LLM`` with the given compilation config and generate once.
+
+    ``compile_config`` is either a ready ``CompilationConfig`` or a value used
+    as its ``mode``. ``tensor_parallel_size=1`` and
+    ``disable_custom_all_reduce=True`` are defaults that ``model_kwargs`` may
+    override. The generated text is printed, and the config object is updated
+    in place with the engine's ``compile_ranges_split_points``.
+    """
+    if isinstance(compile_config, CompilationConfig):
+        compilation_config = compile_config
+    else:
+        compilation_config = CompilationConfig(mode=compile_config)
+
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    sampling_params = SamplingParams(temperature=0)
+
+    # Defaults come first so that anything in model_kwargs overrides them
+    llm_kwargs = {
+        "disable_custom_all_reduce": True,
+        "tensor_parallel_size": 1,
+        **model_kwargs,
+    }
+
+    # No cudagraphs by default
+    if compilation_config.cudagraph_mode is None:
+        compilation_config.cudagraph_mode = CUDAGraphMode.NONE
+
+    llm = LLM(model=model, compilation_config=compilation_config, **llm_kwargs)
+
+    # Print the outputs.
+    for result in llm.generate(prompts, sampling_params):
+        prompt, generated_text = result.prompt, result.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+    # Get the compile ranges split points after vllm config post init
+    # in order to compute compile ranges correctly
+    engine_compilation_config = llm.llm_engine.vllm_config.compilation_config
+    compilation_config.compile_ranges_split_points = (
+        engine_compilation_config.compile_ranges_split_points
+    )
+
+
+@pytest.fixture
+def run_e2e_fusion_test(monkeypatch, caplog_mp_spawn):
+    """Provide a callable that runs a model and verifies fusion counts in logs.
+
+    Uses ``monkeypatch`` to set the environment for the run and
+    ``caplog_mp_spawn`` to capture the DEBUG logs emitted while the model runs.
+    """
+
+    def run(
+        model_name: str,
+        matches: Matches,
+        model_kwargs: dict,
+        attn_backend: AttentionBackendCase,
+        compilation_config: dict,
+        matches_check: list[str],
+        use_deepgemm: bool = False,
+        tp_size: int = 1,
+    ):
+        """Run the model once and assert the fusions listed in ``matches_check``.
+
+        Args:
+            model_name: Model passed on to ``run_model``.
+            matches: Expected fusion counts.
+            model_kwargs: Extra model kwargs; they override the backend's.
+            attn_backend: Attention backend and the kwargs it requires.
+            compilation_config: Keyword arguments for ``CompilationConfig``;
+                must contain ``use_inductor_graph_partition``. Not modified.
+            matches_check: Names of the ``Matches`` fields to verify.
+            use_deepgemm: Selects the value of ``VLLM_USE_DEEP_GEMM``.
+            tp_size: Tensor-parallel size used for the run.
+
+        Raises:
+            AssertionError: If a log count or the number of log entries does
+                not match the expectation.
+        """
+        monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "1" if use_deepgemm else "0")
+
+        # Disable the compile cache to make sure custom passes run.
+        # Otherwise, we can't verify fusion happened through the logs.
+        monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
+
+        # To capture subprocess logs, we need to know whether spawn or fork is used.
+        # Force spawn as it is more general.
+        monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
+
+        model_kwargs = {**attn_backend.model_kwargs, **model_kwargs}
+        model_kwargs["attention_config"] = {"backend": attn_backend.backend.name}
+        model_kwargs["tensor_parallel_size"] = tp_size
+
+        # Copy so that adding splitting_ops below does not leak into the
+        # caller's dict.
+        compilation_config = dict(compilation_config)
+
+        # Always compile the full graph instead of piecewise
+        if not compilation_config["use_inductor_graph_partition"]:
+            compilation_config["splitting_ops"] = []
+
+        full_compilation_config = CompilationConfig(
+            cudagraph_mode=CUDAGraphMode.NONE,
+            mode=CompilationMode.VLLM_COMPILE,
+            inductor_compile_config={"force_disable_caches": True},
+            **compilation_config,
+        )
+
+        with caplog_mp_spawn(logging.DEBUG) as log_holder:
+            run_model(full_compilation_config, model_name, **model_kwargs)
+
+        compile_ranges = full_compilation_config.get_compile_ranges()
+        num_compile_ranges = len(compile_ranges)
+        assert num_compile_ranges in [1, 2]
+
+        print(f"Compile ranges: {compile_ranges}")
+        print("Fusion results:")
+
+        # Iterate through all so printing happens before asserting
+        log_matches_dict = {}
+        for match_name, pattern in FUSION_LOG_PATTERNS.items():
+            found = pattern.findall(log_holder.text)
+            log_matches_dict[match_name] = found
+            print(f"- {match_name}={','.join(found)}")
+
+        # Now check the matches
+        for match_name in matches_check:
+            # ar_rms_fusion is expected to be active in a single compile range;
+            # every other fusion is expected in all of them.
+            num_ranges_activated = (
+                1 if match_name == "ar_rms_fusion" else num_compile_ranges
+            )
+            # One log entry is expected per TP rank per activated range.
+            n_expected = tp_size * num_ranges_activated
+
+            log_matches = [int(ms) for ms in log_matches_dict[match_name]]
+            assert len(log_matches) == n_expected, (
+                f"Could not find {n_expected} {match_name} "
+                f"(found {len(log_matches)}) in:\n {log_holder.text}"
+            )
+
+            expected_matches = getattr(matches, match_name)
+
+            if match_name == "rms_quant_fusion" and "ar_rms_fusion" in matches_check:
+                # AR+rms+quant takes precedence over rms+quant if activated.
+                # That means we get full matching where ar+rms+quant was not activated,
+                # and less where it was
+                assert sum(m == expected_matches for m in log_matches) == tp_size * (
+                    num_ranges_activated - 1
+                ), "Expecting full rms+quant fusion where ar+rms+quant not activated"
+
+                assert all(
+                    expected_matches - matches.ar_rms_fusion <= m <= expected_matches
+                    for m in log_matches
+                ), (
+                    f"Expecting at least {expected_matches - matches.ar_rms_fusion} "
+                    f"where ar+rms+quant was activated"
+                )
+            else:
+                expected_matches_list = [expected_matches] * n_expected
+                assert sorted(log_matches) == expected_matches_list, (
+                    f"{match_name} expected: {expected_matches_list}, "
+                    f"found: {sorted(log_matches)}"
+                )
+
+            if match_name == "ar_rms_fusion":
+                # The remaining compile ranges must each log that the
+                # all-reduce fusion pass was skipped.
+                log_matches = re.findall(
+                    r"pass_manager.py:\d+] Skipping "
+                    r".*AllReduceFusionPass.* with compile range",
+                    log_holder.text,
+                )
+
+                n_expected = tp_size * (num_compile_ranges - num_ranges_activated)
+                assert len(log_matches) == n_expected, (
+                    f'Could not find {n_expected} "Skipping AllReduceFusionPass" '
+                    f"(found {len(log_matches)}) in:\n {log_holder.text}"
+                )
+
+    return run
--- a/tests/compile/fusions_e2e/models.py
+++ b/tests/compile/fusions_e2e/models.py
@@ -0,0 +1,112 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+
+from vllm.utils.flashinfer import has_flashinfer
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
+
+from .common import AttentionBackendCase, Matches, ModelFusionInfo, is_blackwell
+
+# Attn backends
+# FlashInfer case: adds kv_cache_dtype="fp8" to the model kwargs. Skipped unless
+# running on Blackwell with FlashInfer available; the condition is evaluated
+# once, when this module is imported.
+FLASHINFER_ATTN = pytest.param(
+    AttentionBackendCase(
+        backend=AttentionBackendEnum.FLASHINFER,
+        model_kwargs=dict(kv_cache_dtype="fp8"),
+    ),
+    id="FLASHINFER",
+    marks=pytest.mark.skipif(
+        not is_blackwell() or not has_flashinfer(),
+        reason="FI backend requires Blackwell and FlashInfer",
+    ),
+)
+
+# Triton case: no extra model kwargs and no skip condition.
+TRITON_ATTN = pytest.param(
+    AttentionBackendCase(backend=AttentionBackendEnum.TRITON_ATTN), id="TRITON_ATTN"
+)
+
+# Models
+# Each entry gives the expected fusion counts as a function of the number of
+# hidden layers the test keeps. Counts that are not set default to 0.
+
+# Unquantized Llama 3.1 8B: only the distributed counts are set.
+llama3_8b = ModelFusionInfo(
+    model_name="meta-llama/Llama-3.1-8B-Instruct",
+    matches=lambda n_layers: Matches(
+        ar_rms_fusion=n_layers * 2 + 1,
+        sequence_parallel=n_layers * 2 + 1,
+        async_tp=n_layers * 4,
+    ),
+)
+
+# FP8 Llama 3.1 8B: pointwise quant fusions plus the distributed counts.
+llama3_8b_fp8 = ModelFusionInfo(
+    model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
+    matches=lambda n_layers: Matches(
+        rms_quant_fusion=n_layers * 2,
+        act_quant_fusion=n_layers,
+        attn_quant_fusion=n_layers,
+        ar_rms_fusion=n_layers * 2 + 1,
+        sequence_parallel=n_layers * 2 + 1,
+        async_tp=n_layers * 4,
+    ),
+)
+
+# FP4 Llama 3.1 8B: same counts as the FP8 entry, except that no rms+quant
+# fusions are expected.
+llama3_8b_fp4 = ModelFusionInfo(
+    model_name="nvidia/Llama-3.1-8B-Instruct-FP4",
+    matches=lambda n_layers: Matches(
+        rms_quant_fusion=0,
+        act_quant_fusion=n_layers,
+        attn_quant_fusion=n_layers,
+        ar_rms_fusion=n_layers * 2 + 1,
+        sequence_parallel=n_layers * 2 + 1,
+        async_tp=n_layers * 4,
+    ),
+)
+
+# MoEs cannot do act+quant fusion because those ops are hidden from torch.compile.
+# MoEs also only expose 1 rms+quant fusion because the quant for up_proj is hidden.
+# TODO(luka): https://github.com/vllm-project/vllm/issues/31985
+# Also, for MoEs, gemm+collective fusion only happens for dense GEMMs (o_proj/qkv proj)
+
+# The Llama 4 entries put the layer-count override under "text_config" instead
+# of at the top level of the HF config.
+llama4_scout_fp8 = ModelFusionInfo(
+    model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
+    hf_overrides=lambda n_layers: {"text_config": {"num_hidden_layers": n_layers}},
+    matches=lambda n_layers: Matches(
+        rms_quant_fusion=n_layers,
+        attn_quant_fusion=n_layers,
+        ar_rms_fusion=n_layers * 2,
+        sequence_parallel=n_layers * 2,
+        async_tp=n_layers * 2 - 1,
+    ),
+)
+
+# FP4 variant of Llama 4 Scout: as above, but no rms+quant fusions expected.
+llama4_scout_fp4 = ModelFusionInfo(
+    model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-NVFP4",
+    hf_overrides=lambda n_layers: {"text_config": {"num_hidden_layers": n_layers}},
+    matches=lambda n_layers: Matches(
+        rms_quant_fusion=0,
+        attn_quant_fusion=n_layers,
+        ar_rms_fusion=n_layers * 2,
+        sequence_parallel=n_layers * 2,
+        async_tp=n_layers * 2 - 1,
+    ),
+)
+
+# Unquantized Qwen3 MoE: QK norm+RoPE fusion plus the distributed counts.
+qwen3_a3b = ModelFusionInfo(
+    model_name="Qwen/Qwen3-30B-A3B",
+    matches=lambda n_layers: Matches(
+        norm_rope_fusion=n_layers,
+        ar_rms_fusion=n_layers * 2 + 1,
+        sequence_parallel=n_layers * 2 + 1,
+        async_tp=n_layers * 2,
+    ),
+)
+
+# FP8 Qwen3 MoE. The norm_rope_fusion expectation depends on the platform and
+# is evaluated when the lambda is called, not at import.
+qwen3_a3b_fp8 = ModelFusionInfo(
+    model_name="Qwen/Qwen3-30B-A3B-FP8",
+    matches=lambda n_layers: Matches(
+        rms_quant_fusion=n_layers,
+        # TODO broken on Blackwell:
+        # https://github.com/vllm-project/vllm/issues/33295
+        norm_rope_fusion=0 if is_blackwell() else n_layers,
+        attn_quant_fusion=0,  # attn + group quant not supported
+        ar_rms_fusion=n_layers * 2 + 1,
+        sequence_parallel=n_layers * 2 + 1,
+        async_tp=n_layers * 2,
+    ),
+)
--- a/tests/compile/fusions_e2e/test_tp1_quant.py
+++ b/tests/compile/fusions_e2e/test_tp1_quant.py
@@ -0,0 +1,146 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Callable
+
+import pytest
+
+from vllm.config import PassConfig
+
+from .common import (
+    INDUCTOR_GRAPH_PARTITION,
+    AttentionBackendCase,
+    Matches,
+    custom_ops_combos,
+    is_blackwell,
+)
+from .models import (
+    FLASHINFER_ATTN,
+    TRITON_ATTN,
+    llama3_8b_fp4,
+    llama3_8b_fp8,
+    llama4_scout_fp4,
+    llama4_scout_fp8,
+    qwen3_a3b_fp8,
+)
+
+
+@pytest.mark.parametrize(
+    "model_name, matches_fn, model_kwargs, hf_overrides, use_deepgemm",
+    [
+        (*llama3_8b_fp8, False),
+        (*llama4_scout_fp8, False),
+        (*qwen3_a3b_fp8, False),
+        (*qwen3_a3b_fp8, True),
+    ],
+)
+@pytest.mark.parametrize("attn_backend", [TRITON_ATTN, FLASHINFER_ATTN])
+@pytest.mark.parametrize("n_layers", [6])
+@pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm"))
+@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
+def test_tp1_fp8_fusions(
+    model_name: str,
+    matches_fn: Callable[[int], Matches],
+    model_kwargs: dict,
+    hf_overrides: Callable[[int], dict],
+    attn_backend: AttentionBackendCase,
+    n_layers: int,
+    custom_ops: str,
+    inductor_graph_partition: bool,
+    use_deepgemm: bool,
+    run_e2e_fusion_test,
+    monkeypatch,
+):
+    """Check the pointwise quant fusions on FP8 models with TP=1."""
+    if use_deepgemm:
+        # TODO(luka/eliza) DeepGEMM uses different quants, matching not supported
+        #  - on Blackwell, uses a special quant fp8, currently not supported
+        #  - on Hopper, tma-aligned scales inhibit matching (fix WIP)
+        pytest.skip("DeepGEMM & quant matching not currently supported")
+
+    matches = matches_fn(n_layers)
+
+    if "qwen" in model_name.lower() and "-quant_fp8" in custom_ops:
+        # This is why config forces +quant_fp8 by default
+        pytest.skip("native QuantFP8 matching not supported for group quant")
+
+    # Reduce size of model and skip weight loading time.
+    # Build a new dict instead of assigning keys: the parametrized model_kwargs
+    # is the ModelFusionInfo default, a single dict shared by all models.
+    model_kwargs = {
+        **model_kwargs,
+        "hf_overrides": hf_overrides(n_layers),
+        "load_format": "dummy",
+        "max_model_len": 1024,
+    }
+
+    compilation_config = dict(
+        use_inductor_graph_partition=inductor_graph_partition,
+        custom_ops=custom_ops.split(","),
+        pass_config=PassConfig(
+            fuse_norm_quant=True,
+            fuse_act_quant=True,
+            fuse_attn_quant=True,
+            enable_qk_norm_rope_fusion=True,
+        ),
+    )
+
+    matches_check = [
+        "rms_quant_fusion",
+        "act_quant_fusion",
+        "norm_rope_fusion",
+        "attn_quant_fusion",
+    ]
+
+    run_e2e_fusion_test(
+        model_name,
+        matches,
+        model_kwargs,
+        attn_backend,
+        compilation_config,
+        matches_check,
+        use_deepgemm=use_deepgemm,
+    )
+
+
+@pytest.mark.parametrize(
+    "model_name, matches_fn, model_kwargs, hf_overrides",
+    [llama3_8b_fp4, llama4_scout_fp4],
+)
+@pytest.mark.parametrize("attn_backend", [FLASHINFER_ATTN])
+@pytest.mark.parametrize("n_layers", [6])
+@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm"))
+@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
+@pytest.mark.skipif(not is_blackwell(), reason="Blackwell required for fp4")
+def test_tp1_fp4_fusions(
+    model_name: str,
+    matches_fn: Callable[[int], Matches],
+    model_kwargs: dict,
+    hf_overrides: Callable[[int], dict],
+    attn_backend: AttentionBackendCase,
+    n_layers: int,
+    custom_ops: str,
+    inductor_graph_partition: bool,
+    run_e2e_fusion_test,
+):
+    """Check act+quant, attn+quant and QK norm+RoPE fusions on FP4 models, TP=1."""
+    matches = matches_fn(n_layers)
+
+    # Reduce size of model and skip weight loading time.
+    # Build a new dict instead of assigning keys: the parametrized model_kwargs
+    # is the ModelFusionInfo default, a single dict shared by all models.
+    model_kwargs = {
+        **model_kwargs,
+        "hf_overrides": hf_overrides(n_layers),
+        "load_format": "dummy",
+        "max_model_len": 1024,
+    }
+
+    compilation_config = dict(
+        use_inductor_graph_partition=inductor_graph_partition,
+        custom_ops=custom_ops.split(","),
+        pass_config=PassConfig(
+            fuse_norm_quant=True,
+            fuse_act_quant=True,
+            fuse_attn_quant=True,
+            enable_qk_norm_rope_fusion=True,
+        ),
+    )
+
+    matches_check = ["act_quant_fusion", "attn_quant_fusion", "norm_rope_fusion"]
+
+    run_e2e_fusion_test(
+        model_name,
+        matches,
+        model_kwargs,
+        attn_backend,
+        compilation_config,
+        matches_check,
+    )
--- a/tests/compile/fusions_e2e/test_tp2_ar_rms.py
+++ b/tests/compile/fusions_e2e/test_tp2_ar_rms.py
@@ -0,0 +1,199 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Callable
+
+import pytest
+
+from vllm.config import PassConfig
+
+from ...utils import multi_gpu_test
+from .common import (
+    INDUCTOR_GRAPH_PARTITION,
+    AttentionBackendCase,
+    Matches,
+    custom_ops_combos,
+    is_blackwell,
+)
+from .models import (
+    FLASHINFER_ATTN,
+    TRITON_ATTN,
+    llama3_8b,
+    llama3_8b_fp4,
+    llama3_8b_fp8,
+    llama4_scout_fp4,
+    llama4_scout_fp8,
+    qwen3_a3b,
+    qwen3_a3b_fp8,
+)
+
+
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize(
+    "model_name, matches_fn, model_kwargs, hf_overrides",
+    # qwen3-fp8 should still fuse AR+rms even though group quant is not yet supported
+    [llama3_8b_fp8, llama4_scout_fp8, qwen3_a3b_fp8],
+)
+@pytest.mark.parametrize("attn_backend", [TRITON_ATTN, FLASHINFER_ATTN])
+@pytest.mark.parametrize("n_layers", [4])
+@pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm"))
+@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
+def test_tp2_ar_rms_fp8_fusions(
+    model_name: str,
+    matches_fn: Callable[[int], Matches],
+    model_kwargs: dict,
+    hf_overrides: Callable[[int], dict],
+    attn_backend: AttentionBackendCase,
+    n_layers: int,
+    custom_ops: str,
+    inductor_graph_partition: bool,
+    run_e2e_fusion_test,
+    monkeypatch,
+):
+    """Check allreduce+rms together with the quant fusions on FP8 models, TP=2."""
+    matches = matches_fn(n_layers)
+
+    if "qwen" in model_name.lower() and "-quant_fp8" in custom_ops:
+        # This is why config forces +quant_fp8 by default
+        pytest.skip("native QuantFP8 matching not supported for group quant")
+
+    # Reduce size of model and skip weight loading time.
+    # Build a new dict instead of assigning keys: the parametrized model_kwargs
+    # is the ModelFusionInfo default, a single dict shared by all models.
+    model_kwargs = {
+        **model_kwargs,
+        "hf_overrides": hf_overrides(n_layers),
+        "load_format": "dummy",
+        "max_model_len": 1024,
+    }
+
+    compilation_config = dict(
+        use_inductor_graph_partition=inductor_graph_partition,
+        custom_ops=custom_ops.split(","),
+        pass_config=PassConfig(
+            fuse_norm_quant=True,
+            fuse_act_quant=True,
+            fuse_attn_quant=True,
+            enable_qk_norm_rope_fusion=True,
+            fuse_allreduce_rms=True,
+        ),
+    )
+
+    matches_check = [
+        "rms_quant_fusion",
+        "act_quant_fusion",
+        "norm_rope_fusion",
+        "attn_quant_fusion",
+        "ar_rms_fusion",
+    ]
+
+    run_e2e_fusion_test(
+        model_name,
+        matches,
+        model_kwargs,
+        attn_backend,
+        compilation_config,
+        matches_check,
+        tp_size=2,
+    )
+
+
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize(
+    "model_name, matches_fn, model_kwargs, hf_overrides",
+    [llama3_8b_fp4, llama4_scout_fp4],
+)
+@pytest.mark.parametrize("attn_backend", [FLASHINFER_ATTN])
+@pytest.mark.parametrize("n_layers", [4])
+@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm"))
+@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
+@pytest.mark.skipif(not is_blackwell(), reason="Blackwell required for fp4")
+def test_tp2_ar_rms_fp4_fusions(
+    model_name: str,
+    matches_fn: Callable[[int], Matches],
+    model_kwargs: dict,
+    hf_overrides: Callable[[int], dict],
+    attn_backend: AttentionBackendCase,
+    n_layers: int,
+    custom_ops: str,
+    inductor_graph_partition: bool,
+    run_e2e_fusion_test,
+    monkeypatch,
+):
+    """Check allreduce+rms with act+quant and attn+quant on FP4 models, TP=2."""
+    matches = matches_fn(n_layers)
+
+    # Reduce size of model and skip weight loading time.
+    # Build a new dict instead of assigning keys: the parametrized model_kwargs
+    # is the ModelFusionInfo default, a single dict shared by all models.
+    model_kwargs = {
+        **model_kwargs,
+        "hf_overrides": hf_overrides(n_layers),
+        "load_format": "dummy",
+        "max_model_len": 1024,
+    }
+
+    compilation_config = dict(
+        use_inductor_graph_partition=inductor_graph_partition,
+        custom_ops=custom_ops.split(","),
+        pass_config=PassConfig(
+            fuse_act_quant=True,
+            fuse_attn_quant=True,
+            fuse_allreduce_rms=True,
+        ),
+    )
+
+    matches_check = [
+        "act_quant_fusion",
+        "attn_quant_fusion",
+        "ar_rms_fusion",
+    ]
+
+    run_e2e_fusion_test(
+        model_name,
+        matches,
+        model_kwargs,
+        attn_backend,
+        compilation_config,
+        matches_check,
+        tp_size=2,
+    )
+
+
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize(
+    "model_name, matches_fn, model_kwargs, hf_overrides",
+    [llama3_8b, qwen3_a3b],
+)
+@pytest.mark.parametrize("attn_backend", [TRITON_ATTN])
+@pytest.mark.parametrize("n_layers", [4])
+@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm"))
+@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
+def test_tp2_ar_rms_fusions(
+    model_name: str,
+    matches_fn: Callable[[int], Matches],
+    model_kwargs: dict,
+    hf_overrides: Callable[[int], dict],
+    attn_backend: AttentionBackendCase,
+    n_layers: int,
+    custom_ops: str,
+    inductor_graph_partition: bool,
+    run_e2e_fusion_test,
+):
+    """Check allreduce+rms and QK norm+RoPE fusions on unquantized models, TP=2."""
+    matches = matches_fn(n_layers)
+
+    # Reduce size of model and skip weight loading time.
+    # Build a new dict instead of assigning keys: the parametrized model_kwargs
+    # is the ModelFusionInfo default, a single dict shared by all models.
+    model_kwargs = {
+        **model_kwargs,
+        "hf_overrides": hf_overrides(n_layers),
+        "load_format": "dummy",
+        "max_model_len": 1024,
+    }
+
+    compilation_config = dict(
+        use_inductor_graph_partition=inductor_graph_partition,
+        custom_ops=custom_ops.split(","),
+        pass_config=PassConfig(
+            enable_qk_norm_rope_fusion=True,
+            fuse_allreduce_rms=True,
+        ),
+    )
+
+    matches_check = [
+        "norm_rope_fusion",
+        "ar_rms_fusion",
+    ]
+
+    run_e2e_fusion_test(
+        model_name,
+        matches,
+        model_kwargs,
+        attn_backend,
+        compilation_config,
+        matches_check,
+        tp_size=2,
+    )
--- a/tests/compile/fusions_e2e/test_tp2_async_tp.py
+++ b/tests/compile/fusions_e2e/test_tp2_async_tp.py
@@ -0,0 +1,143 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Callable
+
+import pytest
+
+from vllm.config import PassConfig
+
+from ...utils import multi_gpu_test
+from .common import (
+    INDUCTOR_GRAPH_PARTITION,
+    AttentionBackendCase,
+    Matches,
+    custom_ops_combos,
+    is_blackwell,
+)
+from .models import (
+    FLASHINFER_ATTN,
+    TRITON_ATTN,
+    llama3_8b,
+    llama3_8b_fp8,
+    llama4_scout_fp8,
+    qwen3_a3b,
+)
+
+
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize(
+    "model_name, matches_fn, model_kwargs, hf_overrides",
+    [llama3_8b_fp8, llama4_scout_fp8],
+)
+@pytest.mark.parametrize("attn_backend", [TRITON_ATTN, FLASHINFER_ATTN])
+@pytest.mark.parametrize("n_layers", [4])
+@pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm"))
+@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
+def test_tp2_async_tp_fp8_fusions(  # TP=2 e2e run with the PassConfig flags below
+    model_name: str,
+    matches_fn: Callable[[int], Matches],
+    model_kwargs: dict,
+    hf_overrides: Callable[[int], dict],
+    attn_backend: AttentionBackendCase,
+    n_layers: int,
+    custom_ops: str,
+    inductor_graph_partition: bool,
+    run_e2e_fusion_test,  # pytest fixture; not imported in this file
+    monkeypatch,  # pytest fixture; sets an env var on Blackwell below
+):
+    matches = matches_fn(n_layers)  # Matches object built for this n_layers
+
+    if is_blackwell():
+        # Disable FlashInfer scaled_mm FP8 as it's not supported in async tp patterns
+        monkeypatch.setenv("VLLM_DISABLED_KERNELS", "FlashInferFP8ScaledMMLinearKernel")
+
+    # Reduce model size and skip weight loading (mutates model_kwargs in place)
+    model_kwargs["hf_overrides"] = hf_overrides(n_layers)
+    model_kwargs["load_format"] = "dummy"
+    model_kwargs["max_model_len"] = 1024
+
+    compilation_config = dict(
+        use_inductor_graph_partition=inductor_graph_partition,
+        custom_ops=custom_ops.split(","),  # comma-separated str -> list
+        pass_config=PassConfig(
+            fuse_norm_quant=True,
+            fuse_act_quant=True,
+            fuse_attn_quant=True,
+            enable_qk_norm_rope_fusion=True,
+            enable_sp=True,
+            fuse_gemm_comms=True,
+        ),
+    )
+
+    matches_check = [  # presumably Matches fields to verify; TODO confirm
+        "rms_quant_fusion",
+        "act_quant_fusion",
+        "norm_rope_fusion",
+        "attn_quant_fusion",
+        "sequence_parallel",
+        "async_tp",
+    ]
+
+    run_e2e_fusion_test(
+        model_name,
+        matches,
+        model_kwargs,
+        attn_backend,
+        compilation_config,
+        matches_check,
+        tp_size=2,  # same as num_gpus in @multi_gpu_test
+    )
+
+
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize(
+    "model_name, matches_fn, model_kwargs, hf_overrides",
+    [llama3_8b, qwen3_a3b],
+)
+@pytest.mark.parametrize("attn_backend", [TRITON_ATTN])
+@pytest.mark.parametrize("n_layers", [4])
+@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm"))
+@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
+def test_tp2_async_tp_fusions(  # TP=2 e2e run with the PassConfig flags below
+    model_name: str,
+    matches_fn: Callable[[int], Matches],
+    model_kwargs: dict,
+    hf_overrides: Callable[[int], dict],
+    attn_backend: AttentionBackendCase,
+    n_layers: int,
+    custom_ops: str,
+    inductor_graph_partition: bool,
+    run_e2e_fusion_test,  # pytest fixture; not imported in this file
+):
+    matches = matches_fn(n_layers)  # Matches object built for this n_layers
+
+    # Reduce model size and skip weight loading (mutates model_kwargs in place)
+    model_kwargs["hf_overrides"] = hf_overrides(n_layers)
+    model_kwargs["load_format"] = "dummy"
+    model_kwargs["max_model_len"] = 1024
+
+    compilation_config = dict(
+        use_inductor_graph_partition=inductor_graph_partition,
+        custom_ops=custom_ops.split(","),  # comma-separated str -> list
+        pass_config=PassConfig(
+            enable_qk_norm_rope_fusion=True,
+            enable_sp=True,
+            fuse_gemm_comms=True,
+        ),
+    )
+
+    matches_check = [  # presumably Matches fields to verify; TODO confirm
+        "norm_rope_fusion",
+        "sequence_parallel",
+        "async_tp",
+    ]
+
+    run_e2e_fusion_test(
+        model_name,
+        matches,
+        model_kwargs,
+        attn_backend,
+        compilation_config,
+        matches_check,
+        tp_size=2,  # same as num_gpus in @multi_gpu_test
+    )
--- a/tests/compile/test_fusion_attn.py
+++ b/tests/compile/test_fusion_attn.py
@@ -1,23 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import copy
-import logging
-from typing import Any

 import pytest
-import regex as re
 import torch._dynamo

 from tests.compile.backend import LazyInitPass, TestBackend
-from tests.compile.fusion_test_utils import (
-    CUSTOM_OPS_FP8,
-    MODELS_FP4,
-    MODELS_FP8,
-    Matches,
-    has_cuda_graph_wrapper_metadata,
-    is_blackwell,
-    run_model,
-)
 from tests.utils import flat_product
 from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata
 from vllm._custom_ops import cutlass_scaled_fp4_mm, scaled_fp4_quant
@@ -31,7 +19,6 @@ from vllm.config import (
    CacheConfig,
    CompilationConfig,
    CompilationMode,
-    CUDAGraphMode,
    ModelConfig,
    PassConfig,
    SchedulerConfig,
@@ -47,7 +34,6 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
 )
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import has_flashinfer
-from vllm.utils.torch_utils import is_torch_equal_or_newer
 from vllm.v1.attention.backend import AttentionMetadata
 from vllm.v1.attention.backends.registry import AttentionBackendEnum
 from vllm.v1.kv_cache_interface import AttentionSpec
@@ -501,88 +487,3 @@ def test_attention_quant_pattern(

    # Check that results are close
    torch.testing.assert_close(result_unfused, result_fused, atol=1e-2, rtol=1e-2)
-
-
-@pytest.mark.parametrize(
-    "model_name, model_kwargs, backend, matches, custom_ops",
-    # Test attention+quant_fp8 fusion with custom and torch impls of QuantFP8
-    list(flat_product(MODELS_FP8, CUSTOM_OPS_FP8))
-    # quant_fp4 only has the custom impl
-    + list(flat_product(MODELS_FP4, [""])),
-)
-@pytest.mark.parametrize(
-    "inductor_graph_partition",
-    [
-        pytest.param(
-            True,
-            marks=pytest.mark.skipif(
-                not has_cuda_graph_wrapper_metadata(),
-                reason="This test requires"
-                "torch._inductor.utils.CUDAGraphWrapperMetadata to run",
-            ),
-        ),
-        False,
-    ],
-)
-def test_attn_quant(
-    model_name: str,
-    model_kwargs: dict[str, Any],
-    backend: AttentionBackendEnum,
-    matches: Matches,
-    custom_ops: str,
-    inductor_graph_partition: bool,
-    caplog_mp_spawn,
-    monkeypatch,
-):
-    if not current_platform.has_device_capability(90):
-        pytest.skip("test_attn_quant requires H100 (SM90) or B200 (SM100) GPU")
-    if backend == AttentionBackendEnum.FLASHINFER and (
-        not is_blackwell() or not has_flashinfer()
-    ):
-        pytest.skip("FlashInfer attn fusion requires Blackwell and flashinfer")
-    if inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
-        pytest.skip("Inductor graph partition requires torch>=2.9")
-
-    custom_ops_list = custom_ops.split(",") if custom_ops else []
-
-    if inductor_graph_partition:
-        mode = CUDAGraphMode.FULL_AND_PIECEWISE
-        splitting_ops: list[str] | None = None
-    else:
-        # FIXME: Llama-4-Scout-17B-16E-Instruct-FP8 + FlashInfer + Blackwell end at
-        # CUDAGraphMode.NONE here because it derives an attention backend that
-        # does not support full cudagraphs
-        mode = CUDAGraphMode.FULL_DECODE_ONLY
-        splitting_ops = []
-
-    # Disable, compile cache to make sure custom passes run.
-    # Otherwise, we can't verify fusion happened through the logs.
-    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
-
-    # To capture subprocess logs, we need to know whether spawn or fork is used.
-    # Force spawn as it is more general.
-    monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
-    model_kwargs["attention_config"] = {"backend": backend.name}
-
-    compilation_config = CompilationConfig(
-        # Testing properties
-        custom_ops=custom_ops_list,
-        use_inductor_graph_partition=inductor_graph_partition,
-        cudagraph_mode=mode,
-        splitting_ops=splitting_ops,
-        # Common
-        mode=CompilationMode.VLLM_COMPILE,
-        pass_config=PassConfig(fuse_attn_quant=True, eliminate_noops=True),
-        # Inductor caches custom passes by default as well via uuid
-        inductor_compile_config={"force_disable_caches": True},
-    )
-
-    with caplog_mp_spawn(logging.DEBUG) as log_holder:
-        run_model(compilation_config, model_name, **model_kwargs)
-
-    log_matches = re.findall(
-        r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
-        log_holder.text,
-    )
-    assert len(log_matches) == 1, log_holder.text
-    assert int(log_matches[0]) == matches.attention_fusion
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -1002,7 +1002,7 @@ def test_vllm_config_explicit_overrides():
    assert config.compilation_config.pass_config.fuse_attn_quant is True

    # Explicit cudagraph mode override on quantized model at O2
-    pass_config = PassConfig(fuse_gemm_comms=True)
+    pass_config = PassConfig(enable_qk_norm_rope_fusion=True)
    compilation_config = CompilationConfig(
        cudagraph_mode=CUDAGraphMode.NONE, pass_config=pass_config
    )
@@ -1012,7 +1012,7 @@ def test_vllm_config_explicit_overrides():
        compilation_config=compilation_config,
    )
    assert config.compilation_config.cudagraph_mode == CUDAGraphMode.NONE
-    assert config.compilation_config.pass_config.fuse_gemm_comms is True
+    assert config.compilation_config.pass_config.enable_qk_norm_rope_fusion is True
    # Mode should still use default for O2
    assert config.compilation_config.mode == CompilationMode.VLLM_COMPILE

--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -766,7 +766,12 @@ class VllmConfig:
        if self.compilation_config.pass_config.fuse_gemm_comms:
            self.compilation_config.pass_config.enable_sp = True
        if self.compilation_config.pass_config.enable_sp:
-            if "-rms_norm" in self.compilation_config.custom_ops:
+            if self.parallel_config.tensor_parallel_size == 1:
+                logger.warning("Sequence Parallelism requires TP>1, disabling")
+                self.compilation_config.pass_config.enable_sp = False
+                self.compilation_config.pass_config.fuse_gemm_comms = False
+
+            elif "-rms_norm" in self.compilation_config.custom_ops:
                logger.warning(
                    "RMS norm force disabled, sequence parallelism might break"
                )