[CI][torch.compile] Reduce e2e fusion test time (#33293)

Signed-off-by: Luka Govedič <lgovedic@redhat.com> Signed-off-by: ProExpertProg <luka.govedic@gmail.com> Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
2026-02-04 19:09:03 -05:00
parent 439afa4eea
commit 4d9513537d
17 changed files with 1068 additions and 821 deletions
--- a/tests/compile/fusions_e2e/__init__.py
+++ b/tests/compile/fusions_e2e/__init__.py
--- a/tests/compile/fusions_e2e/common.py
+++ b/tests/compile/fusions_e2e/common.py
@@ -0,0 +1,102 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import itertools
+from collections.abc import Callable, Iterable
+from typing import Any, NamedTuple
+
+import pytest
+import regex as re
+
+from vllm.platforms import current_platform
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
+
+
+class Matches(NamedTuple):
+    """Expected fusion counts, one integer field per kind of fusion.
+
+    Every field defaults to 0. The field names are the same strings used as
+    keys of FUSION_LOG_PATTERNS in this module.
+    """
+
+    # simple pointwise
+    rms_quant_fusion: int = 0
+    act_quant_fusion: int = 0
+    norm_rope_fusion: int = 0
+    attn_quant_fusion: int = 0
+    # distributed
+    ar_rms_fusion: int = 0
+    sequence_parallel: int = 0
+    async_tp: int = 0
+
+
+class ModelFusionInfo(NamedTuple):
+    """Describes one model: its name, expected matches and extra arguments.
+
+    Being a tuple, an instance unpacks positionally as
+    (model_name, matches, model_kwargs, hf_overrides).
+    """
+
+    # Model identifier string
+    model_name: str
+    matches: Callable[[int], Matches]
+    """Given number of hidden layers, produces the matches object"""
+    # NOTE: the `{}` default is a single dict object shared by every instance
+    # that does not pass model_kwargs explicitly; treat it as read-only and
+    # copy it before adding keys.
+    model_kwargs: dict[str, Any] = {}
+    # Given a layer count n, returns an overrides dict; the default puts n
+    # under the "num_hidden_layers" key.
+    hf_overrides: Callable[[int], dict] = lambda n: {"num_hidden_layers": n}
+
+
+class AttentionBackendCase(NamedTuple):
+    """An attention backend together with the extra model kwargs it needs."""
+
+    # Which attention backend this case selects
+    backend: AttentionBackendEnum
+    # NOTE: the `{}` default is one dict object shared by all instances that
+    # do not pass model_kwargs; do not mutate it in place.
+    model_kwargs: dict[str, Any] = {}
+    """Additional args required for attn+quant fusion"""
+
+
+is_blackwell = lambda: current_platform.is_device_capability_family(100)
+"""Are we running on Blackwell, a lot of tests depend on it"""
+
+
+def custom_ops_combos(*custom_ops: str) -> Iterable[str]:
+    """Generate all combinations of custom ops for parametrization."""
+    custom_ops_lists = [[f"-{op}", f"+{op}"] for op in custom_ops]
+    for op_list in itertools.product(*custom_ops_lists):
+        yield ",".join(op_list)
+
+
+# Quick inline validation
+# These asserts run when the module is imported and pin the ordering of the
+# generated combinations: "-" before "+" per op, last op varying fastest.
+assert list(custom_ops_combos("silu_and_mul")) == ["-silu_and_mul", "+silu_and_mul"]
+assert list(custom_ops_combos("quant_fp8", "rms_norm")) == [
+    "-quant_fp8,-rms_norm",
+    "-quant_fp8,+rms_norm",
+    "+quant_fp8,-rms_norm",
+    "+quant_fp8,+rms_norm",
+]
+
+
+def has_cuda_graph_wrapper_metadata() -> bool:
+    from importlib import import_module
+
+    try:
+        module = import_module("torch._inductor.utils")
+        module.CUDAGraphWrapperMetadata  # noqa B018
+    except AttributeError:
+        return False
+    return True
+
+
+# Parametrization values for the `inductor_graph_partition` test argument.
+# The True case is skipped when has_cuda_graph_wrapper_metadata() returns
+# False; the False case carries no skip mark.
+INDUCTOR_GRAPH_PARTITION = [
+    pytest.param(
+        True,
+        marks=pytest.mark.skipif(
+            not has_cuda_graph_wrapper_metadata(),
+            reason="torch version does not support Inductor partition",
+        ),
+        id="inductor_partition",
+    ),
+    pytest.param(False, id="dynamo_partition"),
+]
+
+# Maps each Matches field name to a compiled regex. Every pattern has exactly
+# one capture group, which captures the integer count printed in the log line.
+# NOTE(review): "ar_rms_fusion" and "async_tp" use identical pattern text, so
+# any log line matching one also matches the other - confirm this is intended.
+# NOTE(review): the "." in the file names is an unescaped regex wildcard.
+FUSION_LOG_PATTERNS: dict[str, re.Pattern] = {
+    # The leading "\[" (with optional "compilation/") keeps this pattern from
+    # also matching longer file names that merely end in "fusion.py".
+    "rms_quant_fusion": re.compile(
+        r"\[(?:compilation/)?fusion.py:\d+] Replaced (\d+) patterns"
+    ),
+    "act_quant_fusion": re.compile(
+        r"activation_quant_fusion.py:\d+] Replaced (\d+) patterns"
+    ),
+    "norm_rope_fusion": re.compile(
+        r"qk_norm_rope_fusion.py:\d+] Fused QK Norm\+RoPE on (\d+) sites"
+    ),
+    "attn_quant_fusion": re.compile(
+        r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes"
+    ),
+    "ar_rms_fusion": re.compile(r"collective_fusion.py:\d+] Replaced (\d+) patterns"),
+    "sequence_parallel": re.compile(
+        r"sequence_parallelism.py:\d+] Replaced (\d+) patterns"
+    ),
+    "async_tp": re.compile(r"collective_fusion.py:\d+] Replaced (\d+) patterns"),
+}
--- a/tests/compile/fusions_e2e/conftest.py
+++ b/tests/compile/fusions_e2e/conftest.py
@@ -0,0 +1,158 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import logging
+
+import pytest
+import regex as re
+
+from vllm import LLM, SamplingParams
+from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode
+
+from .common import FUSION_LOG_PATTERNS, AttentionBackendCase, Matches
+
+
+def run_model(compile_config: int | CompilationConfig, model: str, **model_kwargs):
+    """Run a model with the given compilation config for E2E fusion tests."""
+    compilation_config = (
+        compile_config
+        if isinstance(compile_config, CompilationConfig)
+        else CompilationConfig(mode=compile_config)
+    )
+
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    sampling_params = SamplingParams(temperature=0)
+    # Allow override from model_kwargs
+    model_kwargs = {"tensor_parallel_size": 1, **model_kwargs}
+    model_kwargs = {"disable_custom_all_reduce": True, **model_kwargs}
+
+    # No cudagraphs by default
+    if compilation_config.cudagraph_mode is None:
+        compilation_config.cudagraph_mode = CUDAGraphMode.NONE
+    llm = LLM(
+        model=model,
+        compilation_config=compilation_config,
+        **model_kwargs,
+    )
+    outputs = llm.generate(prompts, sampling_params)
+
+    # Print the outputs.
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+    # Get the compile ranges split points after vllm config post init
+    # in order to compute compile ranges correctly
+    compilation_config.compile_ranges_split_points = (
+        llm.llm_engine.vllm_config.compilation_config.compile_ranges_split_points
+    )
+
+
+@pytest.fixture
+def run_e2e_fusion_test(monkeypatch, caplog_mp_spawn):
+    def run(
+        model_name: str,
+        matches: Matches,
+        model_kwargs: dict,
+        attn_backend: AttentionBackendCase,
+        compilation_config: dict,
+        matches_check: list[str],
+        use_deepgemm: bool = False,
+        tp_size: int = 1,
+    ):
+        monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "1" if use_deepgemm else "0")
+
+        # Disable, compile cache to make sure custom passes run.
+        # Otherwise, we can't verify fusion happened through the logs.
+        monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
+
+        # To capture subprocess logs, we need to know whether spawn or fork is used.
+        # Force spawn as it is more general.
+        monkeypatch.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
+
+        model_kwargs = {**attn_backend.model_kwargs, **model_kwargs}
+        model_kwargs["attention_config"] = {"backend": attn_backend.backend.name}
+        model_kwargs["tensor_parallel_size"] = tp_size
+
+        # Always compile the full graph instead of piecewise
+        if not compilation_config["use_inductor_graph_partition"]:
+            compilation_config["splitting_ops"] = []
+
+        full_compilation_config = CompilationConfig(
+            cudagraph_mode=CUDAGraphMode.NONE,
+            mode=CompilationMode.VLLM_COMPILE,
+            inductor_compile_config={"force_disable_caches": True},
+            **compilation_config,
+        )
+
+        with caplog_mp_spawn(logging.DEBUG) as log_holder:
+            run_model(full_compilation_config, model_name, **model_kwargs)
+
+        num_compile_ranges = len(full_compilation_config.get_compile_ranges())
+        assert num_compile_ranges in [1, 2]
+
+        print(f"Compile ranges: {full_compilation_config.get_compile_ranges()}")
+        print("Fusion results:")
+
+        # Iterate through all so printing happens before asserting
+        log_matches_dict = {}
+        for match_name, pattern in FUSION_LOG_PATTERNS.items():
+            log_matches_dict[match_name] = list(pattern.findall(log_holder.text))
+            print(f"- {match_name}={','.join(log_matches_dict[match_name])}")
+
+        # Now check the matches
+        for match_name in matches_check:
+            num_ranges_activated = (
+                1 if match_name == "ar_rms_fusion" else num_compile_ranges
+            )
+            n_expected = tp_size * num_ranges_activated
+
+            log_matches = list(int(ms) for ms in log_matches_dict[match_name])
+            assert len(log_matches) == n_expected, (
+                f"Could not find {n_expected} {match_name} "
+                f"(found {len(log_matches)}) in:\n {log_holder.text}"
+            )
+
+            expected_matches = getattr(matches, match_name)
+
+            if match_name == "rms_quant_fusion" and "ar_rms_fusion" in matches_check:
+                # AR+rms+quant takes precedence over rms+quant if activated.
+                # That means we get full matching where ar+rms+quant was not activated,
+                # and less where it was
+                assert sum(m == expected_matches for m in log_matches) == tp_size * (
+                    num_ranges_activated - 1
+                ), "Expecting full rms+quant fusion where ar+rms+quant not activated"
+
+                assert all(
+                    expected_matches - matches.ar_rms_fusion <= m <= expected_matches
+                    for m in log_matches
+                ), (
+                    f"Expecting at least {expected_matches - matches.ar_rms_fusion} "
+                    f"where ar+rms+quant was activated"
+                )
+            else:
+                expected_matches_list = [expected_matches] * n_expected
+                assert sorted(log_matches) == expected_matches_list, (
+                    f"{match_name} expected: {expected_matches_list}, "
+                    f"found: {sorted(log_matches)}"
+                )
+
+            if match_name == "ar_rms_fusion":
+                log_matches = re.findall(
+                    r"pass_manager.py:\d+] Skipping "
+                    r".*AllReduceFusionPass.* with compile range",
+                    log_holder.text,
+                )
+
+                n_expected = tp_size * (num_compile_ranges - num_ranges_activated)
+                assert len(log_matches) == n_expected, (
+                    f'Could not find {n_expected} "Skipping AllReduceFusionPass" '
+                    f"(found {len(log_matches)}) in:\n {log_holder.text}"
+                )
+
+    return run
--- a/tests/compile/fusions_e2e/models.py
+++ b/tests/compile/fusions_e2e/models.py
@@ -0,0 +1,112 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+
+from vllm.utils.flashinfer import has_flashinfer
+from vllm.v1.attention.backends.registry import AttentionBackendEnum
+
+from .common import AttentionBackendCase, Matches, ModelFusionInfo, is_blackwell
+
+# Attn backends
+# FlashInfer case: adds kv_cache_dtype="fp8" to the model kwargs and is skipped
+# unless is_blackwell() and has_flashinfer() are both true.
+FLASHINFER_ATTN = pytest.param(
+    AttentionBackendCase(
+        backend=AttentionBackendEnum.FLASHINFER,
+        model_kwargs=dict(kv_cache_dtype="fp8"),
+    ),
+    id="FLASHINFER",
+    marks=pytest.mark.skipif(
+        not is_blackwell() or not has_flashinfer(),
+        reason="FI backend requires Blackwell and FlashInfer",
+    ),
+)
+
+# Triton attention case: no extra model kwargs and no skip mark.
+TRITON_ATTN = pytest.param(
+    AttentionBackendCase(backend=AttentionBackendEnum.TRITON_ATTN), id="TRITON_ATTN"
+)
+
+# Models
+# Each entry gives the expected fusion counts as a function of the number of
+# hidden layers; fields that are not listed keep the Matches default of 0.
+
+# Only the distributed fusion counts are set for this checkpoint.
+llama3_8b = ModelFusionInfo(
+    model_name="meta-llama/Llama-3.1-8B-Instruct",
+    matches=lambda n_layers: Matches(
+        ar_rms_fusion=n_layers * 2 + 1,
+        sequence_parallel=n_layers * 2 + 1,
+        async_tp=n_layers * 4,
+    ),
+)
+
+# FP8 checkpoint: adds rms+quant, act+quant and attn+quant expectations.
+llama3_8b_fp8 = ModelFusionInfo(
+    model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
+    matches=lambda n_layers: Matches(
+        rms_quant_fusion=n_layers * 2,
+        act_quant_fusion=n_layers,
+        attn_quant_fusion=n_layers,
+        ar_rms_fusion=n_layers * 2 + 1,
+        sequence_parallel=n_layers * 2 + 1,
+        async_tp=n_layers * 4,
+    ),
+)
+
+# FP4 checkpoint: same as the FP8 entry except that no rms+quant fusion is
+# expected.
+llama3_8b_fp4 = ModelFusionInfo(
+    model_name="nvidia/Llama-3.1-8B-Instruct-FP4",
+    matches=lambda n_layers: Matches(
+        rms_quant_fusion=0,
+        act_quant_fusion=n_layers,
+        attn_quant_fusion=n_layers,
+        ar_rms_fusion=n_layers * 2 + 1,
+        sequence_parallel=n_layers * 2 + 1,
+        async_tp=n_layers * 4,
+    ),
+)
+
+# MoEs cannot do act+quant fusion because those ops are hidden from torch.compile.
+# MoEs also only expose 1 rms+quant fusion because the quant for up_proj is hidden.
+# TODO(luka): https://github.com/vllm-project/vllm/issues/31985
+# Also, for MoEs, gemm+collective fusion only happens for dense GEMMs (o_proj/qkv proj)
+
+# The layer-count override for this model is nested under "text_config"
+# instead of sitting at the top level of the overrides dict.
+llama4_scout_fp8 = ModelFusionInfo(
+    model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP8",
+    hf_overrides=lambda n_layers: {"text_config": {"num_hidden_layers": n_layers}},
+    matches=lambda n_layers: Matches(
+        rms_quant_fusion=n_layers,
+        attn_quant_fusion=n_layers,
+        ar_rms_fusion=n_layers * 2,
+        sequence_parallel=n_layers * 2,
+        async_tp=n_layers * 2 - 1,
+    ),
+)
+
+# Same as the FP8 entry except that no rms+quant fusion is expected.
+llama4_scout_fp4 = ModelFusionInfo(
+    model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-NVFP4",
+    hf_overrides=lambda n_layers: {"text_config": {"num_hidden_layers": n_layers}},
+    matches=lambda n_layers: Matches(
+        rms_quant_fusion=0,
+        attn_quant_fusion=n_layers,
+        ar_rms_fusion=n_layers * 2,
+        sequence_parallel=n_layers * 2,
+        async_tp=n_layers * 2 - 1,
+    ),
+)
+
+# Expects one QK norm+RoPE fusion per layer plus the distributed fusions;
+# all quant-related counts keep the default of 0.
+qwen3_a3b = ModelFusionInfo(
+    model_name="Qwen/Qwen3-30B-A3B",
+    matches=lambda n_layers: Matches(
+        norm_rope_fusion=n_layers,
+        ar_rms_fusion=n_layers * 2 + 1,
+        sequence_parallel=n_layers * 2 + 1,
+        async_tp=n_layers * 2,
+    ),
+)
+
+# FP8 variant: the norm_rope_fusion expectation depends on is_blackwell(),
+# which is evaluated each time the matches callable is invoked.
+qwen3_a3b_fp8 = ModelFusionInfo(
+    model_name="Qwen/Qwen3-30B-A3B-FP8",
+    matches=lambda n_layers: Matches(
+        rms_quant_fusion=n_layers,
+        # TODO broken on Blackwell:
+        # https://github.com/vllm-project/vllm/issues/33295
+        norm_rope_fusion=0 if is_blackwell() else n_layers,
+        attn_quant_fusion=0,  # attn + group quant not supported
+        ar_rms_fusion=n_layers * 2 + 1,
+        sequence_parallel=n_layers * 2 + 1,
+        async_tp=n_layers * 2,
+    ),
+)
--- a/tests/compile/fusions_e2e/test_tp1_quant.py
+++ b/tests/compile/fusions_e2e/test_tp1_quant.py
@@ -0,0 +1,146 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Callable
+
+import pytest
+
+from vllm.config import PassConfig
+
+from .common import (
+    INDUCTOR_GRAPH_PARTITION,
+    AttentionBackendCase,
+    Matches,
+    custom_ops_combos,
+    is_blackwell,
+)
+from .models import (
+    FLASHINFER_ATTN,
+    TRITON_ATTN,
+    llama3_8b_fp4,
+    llama3_8b_fp8,
+    llama4_scout_fp4,
+    llama4_scout_fp8,
+    qwen3_a3b_fp8,
+)
+
+
+@pytest.mark.parametrize(
+    "model_name, matches_fn, model_kwargs, hf_overrides, use_deepgemm",
+    [
+        (*llama3_8b_fp8, False),
+        (*llama4_scout_fp8, False),
+        (*qwen3_a3b_fp8, False),
+        (*qwen3_a3b_fp8, True),
+    ],
+)
+@pytest.mark.parametrize("attn_backend", [TRITON_ATTN, FLASHINFER_ATTN])
+@pytest.mark.parametrize("n_layers", [6])
+@pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm"))
+@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
+def test_tp1_fp8_fusions(
+    model_name: str,
+    matches_fn: Callable[[int], Matches],
+    model_kwargs: dict,
+    hf_overrides: Callable[[int], dict],
+    attn_backend: AttentionBackendCase,
+    n_layers: int,
+    custom_ops: str,
+    inductor_graph_partition: bool,
+    use_deepgemm: bool,
+    run_e2e_fusion_test,
+    monkeypatch,
+):
+    if use_deepgemm:
+        # TODO(luka/eliza) DeepGEMM uses different quants, matching not supported
+        #  - on Blackwell, uses a special quant fp8, currently not supported
+        #  - on Hopper, tma-aligned scales inhibit matching (fix WIP)
+        pytest.skip("DeepGEMM & quant matching not currently supported")
+
+    matches = matches_fn(n_layers)
+
+    if "qwen" in model_name.lower() and "-quant_fp8" in custom_ops:
+        # This is why config forces +quant_fp8 by default
+        pytest.skip("native QuantFP8 matching not supported for group quant")
+
+    # Reduce size of model and skip weight loading time
+    model_kwargs["hf_overrides"] = hf_overrides(n_layers)
+    model_kwargs["load_format"] = "dummy"
+    model_kwargs["max_model_len"] = 1024
+
+    compilation_config = dict(
+        use_inductor_graph_partition=inductor_graph_partition,
+        custom_ops=custom_ops.split(","),
+        pass_config=PassConfig(
+            fuse_norm_quant=True,
+            fuse_act_quant=True,
+            fuse_attn_quant=True,
+            enable_qk_norm_rope_fusion=True,
+        ),
+    )
+
+    matches_check = [
+        "rms_quant_fusion",
+        "act_quant_fusion",
+        "norm_rope_fusion",
+        "attn_quant_fusion",
+    ]
+
+    run_e2e_fusion_test(
+        model_name,
+        matches,
+        model_kwargs,
+        attn_backend,
+        compilation_config,
+        matches_check,
+        use_deepgemm=use_deepgemm,
+    )
+
+
+@pytest.mark.parametrize(
+    "model_name, matches_fn, model_kwargs, hf_overrides",
+    [llama3_8b_fp4, llama4_scout_fp4],
+)
+@pytest.mark.parametrize("attn_backend", [FLASHINFER_ATTN])
+@pytest.mark.parametrize("n_layers", [6])
+@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm"))
+@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
+@pytest.mark.skipif(not is_blackwell(), reason="Blackwell required for fp4")
+def test_tp1_fp4_fusions(
+    model_name: str,
+    matches_fn: Callable[[int], Matches],
+    model_kwargs: dict,
+    hf_overrides: Callable[[int], dict],
+    attn_backend: AttentionBackendCase,
+    n_layers: int,
+    custom_ops: str,
+    inductor_graph_partition: bool,
+    run_e2e_fusion_test,
+):
+    matches = matches_fn(n_layers)
+
+    # Reduce size of model and skip weight loading time
+    model_kwargs["hf_overrides"] = hf_overrides(n_layers)
+    model_kwargs["load_format"] = "dummy"
+    model_kwargs["max_model_len"] = 1024
+
+    compilation_config = dict(
+        use_inductor_graph_partition=inductor_graph_partition,
+        custom_ops=custom_ops.split(","),
+        pass_config=PassConfig(
+            fuse_norm_quant=True,
+            fuse_act_quant=True,
+            fuse_attn_quant=True,
+            enable_qk_norm_rope_fusion=True,
+        ),
+    )
+
+    matches_check = ["act_quant_fusion", "attn_quant_fusion", "norm_rope_fusion"]
+
+    run_e2e_fusion_test(
+        model_name,
+        matches,
+        model_kwargs,
+        attn_backend,
+        compilation_config,
+        matches_check,
+    )
--- a/tests/compile/fusions_e2e/test_tp2_ar_rms.py
+++ b/tests/compile/fusions_e2e/test_tp2_ar_rms.py
@@ -0,0 +1,199 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Callable
+
+import pytest
+
+from vllm.config import PassConfig
+
+from ...utils import multi_gpu_test
+from .common import (
+    INDUCTOR_GRAPH_PARTITION,
+    AttentionBackendCase,
+    Matches,
+    custom_ops_combos,
+    is_blackwell,
+)
+from .models import (
+    FLASHINFER_ATTN,
+    TRITON_ATTN,
+    llama3_8b,
+    llama3_8b_fp4,
+    llama3_8b_fp8,
+    llama4_scout_fp4,
+    llama4_scout_fp8,
+    qwen3_a3b,
+    qwen3_a3b_fp8,
+)
+
+
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize(
+    "model_name, matches_fn, model_kwargs, hf_overrides",
+    # qwen3-fp8 should still fuse AR+rms even though group quant is not yet supported
+    [llama3_8b_fp8, llama4_scout_fp8, qwen3_a3b_fp8],
+)
+@pytest.mark.parametrize("attn_backend", [TRITON_ATTN, FLASHINFER_ATTN])
+@pytest.mark.parametrize("n_layers", [4])
+@pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm"))
+@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
+def test_tp2_ar_rms_fp8_fusions(
+    model_name: str,
+    matches_fn: Callable[[int], Matches],
+    model_kwargs: dict,
+    hf_overrides: Callable[[int], dict],
+    attn_backend: AttentionBackendCase,
+    n_layers: int,
+    custom_ops: str,
+    inductor_graph_partition: bool,
+    run_e2e_fusion_test,
+    monkeypatch,
+):
+    matches = matches_fn(n_layers)
+
+    if "qwen" in model_name.lower() and "-quant_fp8" in custom_ops:
+        # This is why config forces +quant_fp8 by default
+        pytest.skip("native QuantFP8 matching not supported for group quant")
+
+    # Reduce size of model and skip weight loading time
+    model_kwargs["hf_overrides"] = hf_overrides(n_layers)
+    model_kwargs["load_format"] = "dummy"
+    model_kwargs["max_model_len"] = 1024
+
+    compilation_config = dict(
+        use_inductor_graph_partition=inductor_graph_partition,
+        custom_ops=custom_ops.split(","),
+        pass_config=PassConfig(
+            fuse_norm_quant=True,
+            fuse_act_quant=True,
+            fuse_attn_quant=True,
+            enable_qk_norm_rope_fusion=True,
+            fuse_allreduce_rms=True,
+        ),
+    )
+
+    matches_check = [
+        "rms_quant_fusion",
+        "act_quant_fusion",
+        "norm_rope_fusion",
+        "attn_quant_fusion",
+        "ar_rms_fusion",
+    ]
+
+    run_e2e_fusion_test(
+        model_name,
+        matches,
+        model_kwargs,
+        attn_backend,
+        compilation_config,
+        matches_check,
+        tp_size=2,
+    )
+
+
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize(
+    "model_name, matches_fn, model_kwargs, hf_overrides",
+    [llama3_8b_fp4, llama4_scout_fp4],
+)
+@pytest.mark.parametrize("attn_backend", [FLASHINFER_ATTN])
+@pytest.mark.parametrize("n_layers", [4])
+@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm"))
+@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
+@pytest.mark.skipif(not is_blackwell(), reason="Blackwell required for fp4")
+def test_tp2_ar_rms_fp4_fusions(
+    model_name: str,
+    matches_fn: Callable[[int], Matches],
+    model_kwargs: dict,
+    hf_overrides: Callable[[int], dict],
+    attn_backend: AttentionBackendCase,
+    n_layers: int,
+    custom_ops: str,
+    inductor_graph_partition: bool,
+    run_e2e_fusion_test,
+    monkeypatch,
+):
+    matches = matches_fn(n_layers)
+
+    # Reduce size of model and skip weight loading time
+    model_kwargs["hf_overrides"] = hf_overrides(n_layers)
+    model_kwargs["load_format"] = "dummy"
+    model_kwargs["max_model_len"] = 1024
+
+    compilation_config = dict(
+        use_inductor_graph_partition=inductor_graph_partition,
+        custom_ops=custom_ops.split(","),
+        pass_config=PassConfig(
+            fuse_act_quant=True,
+            fuse_attn_quant=True,
+            fuse_allreduce_rms=True,
+        ),
+    )
+
+    matches_check = [
+        "act_quant_fusion",
+        "attn_quant_fusion",
+        "ar_rms_fusion",
+    ]
+
+    run_e2e_fusion_test(
+        model_name,
+        matches,
+        model_kwargs,
+        attn_backend,
+        compilation_config,
+        matches_check,
+        tp_size=2,
+    )
+
+
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize(
+    "model_name, matches_fn, model_kwargs, hf_overrides",
+    [llama3_8b, qwen3_a3b],
+)
+@pytest.mark.parametrize("attn_backend", [TRITON_ATTN])
+@pytest.mark.parametrize("n_layers", [4])
+@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm"))
+@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
+def test_tp2_ar_rms_fusions(
+    model_name: str,
+    matches_fn: Callable[[int], Matches],
+    model_kwargs: dict,
+    hf_overrides: Callable[[int], dict],
+    attn_backend: AttentionBackendCase,
+    n_layers: int,
+    custom_ops: str,
+    inductor_graph_partition: bool,
+    run_e2e_fusion_test,
+):
+    matches = matches_fn(n_layers)
+
+    # Reduce size of model and skip weight loading time
+    model_kwargs["hf_overrides"] = hf_overrides(n_layers)
+    model_kwargs["load_format"] = "dummy"
+    model_kwargs["max_model_len"] = 1024
+
+    compilation_config = dict(
+        use_inductor_graph_partition=inductor_graph_partition,
+        custom_ops=custom_ops.split(","),
+        pass_config=PassConfig(
+            enable_qk_norm_rope_fusion=True,
+            fuse_allreduce_rms=True,
+        ),
+    )
+
+    matches_check = [
+        "norm_rope_fusion",
+        "ar_rms_fusion",
+    ]
+
+    run_e2e_fusion_test(
+        model_name,
+        matches,
+        model_kwargs,
+        attn_backend,
+        compilation_config,
+        matches_check,
+        tp_size=2,
+    )
--- a/tests/compile/fusions_e2e/test_tp2_async_tp.py
+++ b/tests/compile/fusions_e2e/test_tp2_async_tp.py
@@ -0,0 +1,143 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Callable
+
+import pytest
+
+from vllm.config import PassConfig
+
+from ...utils import multi_gpu_test
+from .common import (
+    INDUCTOR_GRAPH_PARTITION,
+    AttentionBackendCase,
+    Matches,
+    custom_ops_combos,
+    is_blackwell,
+)
+from .models import (
+    FLASHINFER_ATTN,
+    TRITON_ATTN,
+    llama3_8b,
+    llama3_8b_fp8,
+    llama4_scout_fp8,
+    qwen3_a3b,
+)
+
+
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize(
+    "model_name, matches_fn, model_kwargs, hf_overrides",
+    [llama3_8b_fp8, llama4_scout_fp8],
+)
+@pytest.mark.parametrize("attn_backend", [TRITON_ATTN, FLASHINFER_ATTN])
+@pytest.mark.parametrize("n_layers", [4])
+@pytest.mark.parametrize("custom_ops", custom_ops_combos("quant_fp8", "rms_norm"))
+@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
+def test_tp2_async_tp_fp8_fusions(
+    model_name: str,
+    matches_fn: Callable[[int], Matches],
+    model_kwargs: dict,
+    hf_overrides: Callable[[int], dict],
+    attn_backend: AttentionBackendCase,
+    n_layers: int,
+    custom_ops: str,
+    inductor_graph_partition: bool,
+    run_e2e_fusion_test,
+    monkeypatch,
+):
+    matches = matches_fn(n_layers)
+
+    if is_blackwell():
+        # Disable FlashInfer scaled_mm FP8 as it's not supported in async tp patterns
+        monkeypatch.setenv("VLLM_DISABLED_KERNELS", "FlashInferFP8ScaledMMLinearKernel")
+
+    # Reduce size of model and skip weight loading time
+    model_kwargs["hf_overrides"] = hf_overrides(n_layers)
+    model_kwargs["load_format"] = "dummy"
+    model_kwargs["max_model_len"] = 1024
+
+    compilation_config = dict(
+        use_inductor_graph_partition=inductor_graph_partition,
+        custom_ops=custom_ops.split(","),
+        pass_config=PassConfig(
+            fuse_norm_quant=True,
+            fuse_act_quant=True,
+            fuse_attn_quant=True,
+            enable_qk_norm_rope_fusion=True,
+            enable_sp=True,
+            fuse_gemm_comms=True,
+        ),
+    )
+
+    matches_check = [
+        "rms_quant_fusion",
+        "act_quant_fusion",
+        "norm_rope_fusion",
+        "attn_quant_fusion",
+        "sequence_parallel",
+        "async_tp",
+    ]
+
+    run_e2e_fusion_test(
+        model_name,
+        matches,
+        model_kwargs,
+        attn_backend,
+        compilation_config,
+        matches_check,
+        tp_size=2,
+    )
+
+
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize(
+    "model_name, matches_fn, model_kwargs, hf_overrides",
+    [llama3_8b, qwen3_a3b],
+)
+@pytest.mark.parametrize("attn_backend", [TRITON_ATTN])
+@pytest.mark.parametrize("n_layers", [4])
+@pytest.mark.parametrize("custom_ops", custom_ops_combos("rms_norm"))
+@pytest.mark.parametrize("inductor_graph_partition", INDUCTOR_GRAPH_PARTITION)
+def test_tp2_async_tp_fusions(
+    model_name: str,
+    matches_fn: Callable[[int], Matches],
+    model_kwargs: dict,
+    hf_overrides: Callable[[int], dict],
+    attn_backend: AttentionBackendCase,
+    n_layers: int,
+    custom_ops: str,
+    inductor_graph_partition: bool,
+    run_e2e_fusion_test,
+):
+    matches = matches_fn(n_layers)
+
+    # Reduce size of model and skip weight loading time
+    model_kwargs["hf_overrides"] = hf_overrides(n_layers)
+    model_kwargs["load_format"] = "dummy"
+    model_kwargs["max_model_len"] = 1024
+
+    compilation_config = dict(
+        use_inductor_graph_partition=inductor_graph_partition,
+        custom_ops=custom_ops.split(","),
+        pass_config=PassConfig(
+            enable_qk_norm_rope_fusion=True,
+            enable_sp=True,
+            fuse_gemm_comms=True,
+        ),
+    )
+
+    matches_check = [
+        "norm_rope_fusion",
+        "sequence_parallel",
+        "async_tp",
+    ]
+
+    run_e2e_fusion_test(
+        model_name,
+        matches,
+        model_kwargs,
+        attn_backend,
+        compilation_config,
+        matches_check,
+        tp_size=2,
+    )