[V1][core] Implement pipeline parallel on Ray (#12996)

2025-02-13 00:02:46 -08:00
parent 0ccd8769fb
commit 9605c1256e
7 changed files with 110 additions and 45 deletions
--- a/tests/distributed/test_pipeline_parallel.py
+++ b/tests/distributed/test_pipeline_parallel.py
@@ -40,10 +40,23 @@ class PPTestOptions(NamedTuple):
@dataclass
 class PPTestSettings:
    parallel_setups: List[ParallelSetup]
+    # NOTE: distributed_backends and vllm_major_versions must
+    # have the same length. They are zipped together, so the
+    # i-th backend is always paired with the i-th major version
+    # when iterating over the test settings.
    distributed_backends: List[str]
+    # vllm major version: "0" for V0, "1" for V1
+    vllm_major_versions: List[str]
    task: TaskOption
    test_options: PPTestOptions

+    def __post_init__(self):  # lists are zipped pairwise, so sizes must match
+        n_back = len(self.distributed_backends)
+        n_ver = len(self.vllm_major_versions)
+        if n_back != n_ver:
+            raise ValueError("Length mismatch: distributed_backends "
+                             f"({n_back}) != vllm_major_versions ({n_ver})")
+
    @staticmethod
    def detailed(
        *,
@@ -79,7 +92,9 @@ class PPTestSettings:
                              eager_mode=True,
                              chunked_prefill=False),
            ],
-            distributed_backends=["mp", "ray"],
+            # only ray is supported for V1
+            distributed_backends=["mp", "ray", "ray"],
+            vllm_major_versions=["0", "0", "1"],
            task=task,
            test_options=PPTestOptions(multi_node_only=multi_node_only,
                                       trust_remote_code=trust_remote_code,
@@ -108,6 +123,7 @@ class PPTestSettings:
                              chunked_prefill=False),
            ],
            distributed_backends=["mp"],
+            vllm_major_versions=["0"],
            task=task,
            test_options=PPTestOptions(multi_node_only=multi_node_only,
                                       trust_remote_code=trust_remote_code,
@@ -120,8 +136,9 @@ class PPTestSettings:
        opts = self.test_options

        for parallel_setup in self.parallel_setups:
-            for distributed_backend in self.distributed_backends:
-                yield (model_name, parallel_setup, distributed_backend,
+            for backend, vllm_major_version in zip(self.distributed_backends,
+                                                   self.vllm_major_versions):
+                yield (model_name, parallel_setup, backend, vllm_major_version,
                       self.task, opts)


@@ -244,6 +261,7 @@ def _compare_tp(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
+    vllm_major_version: str,
    task: TaskOption,
    test_options: PPTestOptions,
    num_gpus_available: int,
@@ -296,10 +314,13 @@ def _compare_tp(
    if hf_overrides:
        common_args.extend(["--hf-overrides", hf_overrides])

-    if (distributed_backend == "ray" and tp_size == 2 and pp_size == 2
-            and chunked_prefill):
-        # Test Ray ADAG for a subset of the tests
+    specific_case = tp_size == 2 and pp_size == 2 and chunked_prefill
+    if distributed_backend == "ray" and (vllm_major_version == "1"
+                                         or specific_case):
+        # For V1, test Ray ADAG for all the tests
+        # For V0, test Ray ADAG for a subset of the tests
        pp_env = {
+            "VLLM_USE_V1": vllm_major_version,
            "VLLM_USE_RAY_COMPILED_DAG": "1",
            "VLLM_USE_RAY_SPMD_WORKER": "1",
            "VLLM_USE_RAY_COMPILED_DAG_NCCL_CHANNEL": "1",
@@ -348,8 +369,8 @@ def _compare_tp(


@pytest.mark.parametrize(
-    ("model_name", "parallel_setup", "distributed_backend", "task",
-     "test_options"),
+    ("model_name", "parallel_setup", "distributed_backend",
+     "vllm_major_version", "task", "test_options"),
    [
        params for model_name, settings in TEXT_GENERATION_MODELS.items()
        for params in settings.iter_params(model_name)
@@ -361,6 +382,7 @@ def test_tp_language_generation(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
+    vllm_major_version: str,
    task: TaskOption,
    test_options: PPTestOptions,
    num_gpus_available,
@@ -368,6 +390,7 @@ def test_tp_language_generation(
    _compare_tp(model_name,
                parallel_setup,
                distributed_backend,
+                vllm_major_version,
                task,
                test_options,
                num_gpus_available,
@@ -375,8 +398,8 @@ def test_tp_language_generation(


@pytest.mark.parametrize(
-    ("model_name", "parallel_setup", "distributed_backend", "task",
-     "test_options"),
+    ("model_name", "parallel_setup", "distributed_backend",
+     "vllm_major_version", "task", "test_options"),
    [
        params for model_name, settings in EMBEDDING_MODELS.items()
        for params in settings.iter_params(model_name)
@@ -388,6 +411,7 @@ def test_tp_language_embedding(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
+    vllm_major_version: str,
    task: TaskOption,
    test_options: PPTestOptions,
    num_gpus_available,
@@ -395,6 +419,7 @@ def test_tp_language_embedding(
    _compare_tp(model_name,
                parallel_setup,
                distributed_backend,
+                vllm_major_version,
                task,
                test_options,
                num_gpus_available,
@@ -402,8 +427,8 @@ def test_tp_language_embedding(


@pytest.mark.parametrize(
-    ("model_name", "parallel_setup", "distributed_backend", "task",
-     "test_options"),
+    ("model_name", "parallel_setup", "distributed_backend",
+     "vllm_major_version", "task", "test_options"),
    [
        params for model_name, settings in MULTIMODAL_MODELS.items()
        for params in settings.iter_params(model_name)
@@ -415,6 +440,7 @@ def test_tp_multimodal_generation(
    model_name: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
+    vllm_major_version: str,
    task: TaskOption,
    test_options: PPTestOptions,
    num_gpus_available,
@@ -422,6 +448,7 @@ def test_tp_multimodal_generation(
    _compare_tp(model_name,
                parallel_setup,
                distributed_backend,
+                vllm_major_version,
                task,
                test_options,
                num_gpus_available,