[compile] Enable sequence parallelism matching w/o custom ops enabled (#27126)

Signed-off-by: angelayi <yiangela7@gmail.com> Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com> Signed-off-by: ProExpertProg <lgovedic@redhat.com> Co-authored-by: Luka Govedič <lgovedic@redhat.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com> Co-authored-by: Luka Govedič <luka.govedic@gmail.com>
2025-11-15 03:46:12 -08:00
parent 173b356abf
commit f36292dbee
6 changed files with 472 additions and 444 deletions
--- a/tests/distributed/test_sequence_parallel.py
+++ b/tests/distributed/test_sequence_parallel.py
@@ -18,6 +18,7 @@ import pytest
 from vllm.config.compilation import CompilationMode
 from vllm.config.model import RunnerOption
 from vllm.logger import init_logger
+from vllm.platforms import current_platform
 from vllm.utils.torch_utils import is_torch_equal_or_newer

 from ..models.registry import HF_EXAMPLE_MODELS
@@ -161,6 +162,7 @@ def _compare_sp(
    test_options: SPTestOptions,
    num_gpus_available: int,
    use_inductor_graph_partition: bool,
+    enable_async_tp: bool,
    *,
    method: Literal["generate", "encode"],
    is_multimodal: bool,
@@ -244,10 +246,10 @@ def _compare_sp(

    compilation_config = {
        "mode": CompilationMode.VLLM_COMPILE,
-        "custom_ops": ["+rms_norm"],
        "compile_sizes": [4, 8],
        "pass_config": {
            "enable_sequence_parallelism": True,
+            "enable_async_tp": enable_async_tp,
            "enable_fusion": enable_fusion,
            "enable_noop": True,
        },
@@ -307,6 +309,7 @@ SP_TEST_MODELS = [
    ],
 )
@pytest.mark.parametrize("use_inductor_graph_partition", [True, False])
+@pytest.mark.parametrize("enable_async_tp", [False])  # TODO: enable async TP
@create_new_process_for_each_test()
 def test_tp_sp_generation(
    model_id: str,
@@ -316,10 +319,19 @@ def test_tp_sp_generation(
    test_options: SPTestOptions,
    num_gpus_available,
    use_inductor_graph_partition: bool,
+    enable_async_tp: bool,
 ):
    if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
        pytest.skip("inductor graph partition is only available in PyTorch 2.9+")

+    # Skip FP8 SP-only test on sm89 (compute capability 8.9)
+    if (
+        "fp8" in model_id.lower()
+        and current_platform.get_device_capability() < (9, 0)
+        and (not enable_async_tp)
+    ):
+        pytest.skip("FP8 reduction support begins with sm90 capable devices.")
+
    _compare_sp(
        model_id,
        parallel_setup,
@@ -328,6 +340,7 @@ def test_tp_sp_generation(
        test_options,
        num_gpus_available,
        use_inductor_graph_partition,
+        enable_async_tp=enable_async_tp,
        method="generate",
        is_multimodal=False,
    )