[Compile] Conditional compilation. Introduce compile_ranges (#24252)

Signed-off-by: Luka Govedič <lgovedic@redhat.com> Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com> Signed-off-by: ilmarkov <markovilya197@gmail.com> Signed-off-by: Luka Govedič <luka.govedic@gmail.com> Signed-off-by: ProExpertProg <lgovedic@redhat.com> Co-authored-by: Luka Govedič <lgovedic@redhat.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com> Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com> Co-authored-by: Luka Govedič <luka.govedic@gmail.com>
2025-12-05 19:17:32 +01:00
parent 66e674cdd5
commit 4e26d3b09e
15 changed files with 582 additions and 268 deletions
--- a/tests/compile/distributed/test_fusions_e2e.py
+++ b/tests/compile/distributed/test_fusions_e2e.py
@@ -298,10 +298,14 @@ def test_tp2_attn_quant_allreduce_rmsnorm(
        r"fusion_attn.py:\d+] Fused quant onto (\d+) attention nodes",
        log_holder.text,
    )
-    assert len(log_matches) == 2, log_holder.text
+    # 2 for each compile range
+    # (global compile range can be split due to fuse_allreduce_rmsnorm)
+    num_compile_ranges = len(compilation_config.get_compile_ranges())
+    assert num_compile_ranges in [1, 2]

-    assert int(log_matches[0]) == matches.attention_fusion
-    assert int(log_matches[1]) == matches.attention_fusion
+    assert len(log_matches) == 2 * num_compile_ranges, log_holder.text
+
+    assert all(int(log_match) == matches.attention_fusion for log_match in log_matches)

    log_matches = re.findall(
        r"collective_fusion.py:\d+] Replaced (\d+) patterns",
@@ -312,6 +316,12 @@ def test_tp2_attn_quant_allreduce_rmsnorm(
    assert int(log_matches[0]) == matches.allreduce_fusion
    assert int(log_matches[1]) == matches.allreduce_fusion

+    log_matches = re.findall(
+        r"pass_manager.py:\d+] Skipping .*AllReduceFusionPass.* with compile range",
+        log_holder.text,
+    )
+    assert len(log_matches) == 2 * (num_compile_ranges - 1), log_holder.text
+

@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
@@ -446,7 +456,6 @@ def run_model(compile_config: int | CompilationConfig, model: str, **model_kwarg
    # No cudagraphs by default
    if compilation_config.cudagraph_mode is None:
        compilation_config.cudagraph_mode = CUDAGraphMode.NONE
-
    llm = LLM(
        model=model,
        compilation_config=compilation_config,
@@ -459,3 +468,9 @@ def run_model(compile_config: int | CompilationConfig, model: str, **model_kwarg
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+    # Get the compile ranges split points after vllm config post init
+    # in order to compute compile ranges correctly
+    compilation_config.compile_ranges_split_points = (
+        llm.llm_engine.vllm_config.compilation_config.compile_ranges_split_points
+    )