[torch.compile] Fix tests for torch==2.9 inductor partition (#26116)

Signed-off-by: ProExpertProg <lgovedic@redhat.com> Signed-off-by: Luka Govedič <lgovedic@redhat.com>
2025-10-14 19:55:02 -04:00
parent 579d2e5458
commit 2dcd12d357
8 changed files with 138 additions and 72 deletions
--- a/tests/compile/piecewise/test_multiple_graphs.py
+++ b/tests/compile/piecewise/test_multiple_graphs.py
@@ -5,6 +5,7 @@ Test (piecewise) compilation with a simple model where multiple submodules
 are compiled and graph captured separately.
 """

+import pytest
 import torch
 from torch import nn

@@ -190,7 +191,12 @@ def run_model(
        return output.cpu()


-def test_multi_graph_piecewise_compile_outputs_equal():
+@pytest.mark.parametrize("use_inductor_graph_partition", [False, True])
+def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool):
+    if use_inductor_graph_partition:
+        # FIXME(luka/boyuan): this currently fails
+        pytest.skip("Inductor graph partition not supported with multi-graph")
+
    outputs = []

    # piecewise compile
@@ -200,6 +206,7 @@ def test_multi_graph_piecewise_compile_outputs_equal():
            use_cudagraph=True,
            splitting_ops=["silly::attention"],
            cudagraph_capture_sizes=[1, 2],
+            use_inductor_graph_partition=use_inductor_graph_partition,
        )
    )
    cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
@@ -220,16 +227,24 @@ def test_multi_graph_piecewise_compile_outputs_equal():
    # static tensor addresses
    inputs = torch.randn(BATCH_SIZE, MLP_SIZE).cuda()

-    with compilation_counter.expect(
-        num_graphs_seen=2,  # two graphs for the model
-        num_piecewise_graphs_seen=6,
+    if use_inductor_graph_partition:
+        # Splitting happens at Inductor lowering level,
+        # total piecewise fx graphs is equal to total graphs
+        num_piecewise_fx = 2
+        num_piecewise_capturable_fx = 2
+    else:
        # attn_one, attn_two each has 3 piecewise graphs
        # (pre attn, post attn, silly_attention) each
-        num_piecewise_capturable_graphs_seen=4,
+        num_piecewise_fx = 6
        # attn_one, attn_two has pre attn and post attn each, total=4
-        num_backend_compilations=4,  # num_piecewise_capturable_graphs_seen
-        num_cudagraph_captured=8,
-        # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+        num_piecewise_capturable_fx = 4
+
+    with compilation_counter.expect(
+        num_graphs_seen=2,  # two graphs for the model
+        num_piecewise_graphs_seen=num_piecewise_fx,
+        num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx,
+        num_backend_compilations=num_piecewise_capturable_fx,
+        num_cudagraph_captured=8,  # num_cudagraph_sizes * num_partitions
    ):
        outputs.append(run_model(vllm_config, model, inputs, cudagraph_runtime_mode))

@@ -268,6 +283,7 @@ def test_multi_graph_piecewise_compile_outputs_equal():
            level=CompilationLevel.PIECEWISE,
            use_cudagraph=False,
            splitting_ops=["silly::attention"],
+            use_inductor_graph_partition=use_inductor_graph_partition,
        )
    )
    cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
@@ -286,9 +302,9 @@ def test_multi_graph_piecewise_compile_outputs_equal():

    with compilation_counter.expect(
        num_graphs_seen=2,
-        num_piecewise_graphs_seen=6,
-        num_piecewise_capturable_graphs_seen=4,
-        num_backend_compilations=4,
+        num_piecewise_graphs_seen=num_piecewise_fx,
+        num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx,
+        num_backend_compilations=num_piecewise_capturable_fx,
        num_cudagraph_captured=0,  # no cudagraph captured
    ):
        outputs.append(run_model(vllm_config, model, inputs, cudagraph_runtime_mode))