[V1][PP] Run engine busy loop with batch queue (#13064)

Cody Yu
2025-02-15 03:59:01 -08:00
committed by GitHub
parent ed0de3e4b8
commit 9206b3d7ec
6 changed files with 299 additions and 15 deletions
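For context, the commit title describes a busy loop that keeps a queue of in-flight batches. The engine-core loop itself is not part of this section, so the following is only an illustrative sketch of the pattern, assuming the `Executor.max_concurrent_batches` property and the future-returning `execute_model()` shown in the diffs below; `schedule`, `execute_model`, and `process_outputs` are hypothetical stand-ins, not the actual engine-core API:

# Illustrative sketch only -- not the engine-core code from this commit.
from collections import deque
from concurrent.futures import Future


def busy_loop(schedule, execute_model, process_outputs,
              max_concurrent_batches: int) -> None:
    """Keep up to `max_concurrent_batches` batches in flight and block
    only on the oldest one, so pipeline-parallel stages stay busy."""
    batch_queue: deque[Future] = deque()
    while True:
        scheduler_output = schedule()  # None when there is nothing to run
        if (scheduler_output is not None
                and len(batch_queue) < max_concurrent_batches):
            batch_queue.append(execute_model(scheduler_output))
            continue  # try to fill the pipeline before blocking
        if batch_queue:
            # Consume results in FIFO order; only the oldest batch blocks.
            process_outputs(batch_queue.popleft().result())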

vllm/v1/executor/abstract.py

@@ -1,11 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
-from typing import List, Type
+from concurrent.futures import Future
+from typing import List, Type, Union
 
 from vllm.config import VllmConfig
 from vllm.executor.executor_base import ExecutorBase
-from vllm.executor.ray_distributed_executor import (  # noqa
-    RayDistributedExecutor as RayDistributedExecutorV0)
 from vllm.executor.uniproc_executor import (  # noqa
     ExecutorWithExternalLauncher as ExecutorWithExternalLauncherV0)
 from vllm.executor.uniproc_executor import (  # noqa
@@ -33,6 +32,8 @@ class Executor(ExecutorBase):
                     f"ExecutorBase. Got {distributed_executor_backend}.")
             executor_class = distributed_executor_backend
         elif distributed_executor_backend == "ray":
+            from vllm.v1.executor.ray_distributed_executor import (  # noqa
+                RayDistributedExecutor)
             executor_class = RayDistributedExecutor
         elif distributed_executor_backend == "mp":
             from vllm.v1.executor.multiproc_executor import MultiprocExecutor
@@ -70,11 +71,15 @@ class Executor(ExecutorBase):
     def execute_model(
         self,
         scheduler_output,
-    ) -> ModelRunnerOutput:
+    ) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]:
         output = self.collective_rpc("execute_model",
                                      args=(scheduler_output, ))
         return output[0]
 
+    @property
+    def max_concurrent_batches(self) -> int:
+        return 1
+
     def profile(self, is_start: bool = True):
         self.collective_rpc("profile", args=(is_start, ))
@@ -85,7 +90,3 @@ class UniProcExecutor(UniProcExecutorV0, Executor):
 
 class ExecutorWithExternalLauncher(ExecutorWithExternalLauncherV0, Executor):
     pass
-
-
-class RayDistributedExecutor(RayDistributedExecutorV0, Executor):
-    pass

vllm/v1/executor/ray_distributed_executor.py (new file)

@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: Apache-2.0
+
+from concurrent.futures import Future
+from typing import Union
+
+from vllm.executor.ray_distributed_executor import (  # noqa
+    RayDistributedExecutor as RayDistributedExecutorV0)
+from vllm.v1.executor.abstract import Executor
+from vllm.v1.outputs import ModelRunnerOutput
+
+
+class FutureWrapper(Future):
+    """A wrapper around a Ray output reference to meet the interface
+    of .execute_model().
+    """
+
+    def __init__(self, ref):
+        super().__init__()
+        self.ref = ref
+
+    def result(self, timeout=None):
+        if timeout is not None:
+            raise NotImplementedError("timeout is not supported")
+        return self.ref.get()
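`FutureWrapper` only ever calls `.get()` on the wrapped reference, so any duck-typed object works. A quick check using the `FutureWrapper` defined above and a hypothetical `FakeRef` stand-in for the Ray output reference:

class FakeRef:
    # Duck-typed stand-in for a Ray output reference; FutureWrapper
    # only requires a .get() method.
    def get(self):
        return "model-runner-output"


fut = FutureWrapper(FakeRef())
assert fut.result() == "model-runner-output"  # result() delegates to ref.get()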
+
+
+class RayDistributedExecutor(RayDistributedExecutorV0, Executor):
+    """Ray distributed executor using Ray Compiled Graphs."""
+
+    @property
+    def max_concurrent_batches(self) -> int:
+        """Ray distributed executor supports pipeline parallelism,
+        meaning that it allows PP size batches to be executed concurrently.
+        """
+        return 1  # self.vllm_config.parallel_config.pipeline_parallel_size
+
+    def execute_model(
+        self,
+        scheduler_output,
+    ) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]:
+        """Execute the model on the Ray workers.
+
+        Args:
+            scheduler_output: The scheduler output to execute.
+
+        Returns:
+            The model runner output.
+        """
+        # Build the compiled DAG for the first time.
+        if self.forward_dag is None:  # type: ignore
+            self.forward_dag = self._compiled_ray_dag(enable_asyncio=False)
+
+        refs = self.forward_dag.execute(scheduler_output)  # type: ignore
+
+        # When PP is not used, we block here until the result is available.
+        if self.max_concurrent_batches == 1:
+            return refs[0].get()
+
+        # When PP is used, we return a FutureWrapper immediately so that
+        # the scheduler can yield to the next batch.
+        return FutureWrapper(refs[0])
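A worked example of why `max_concurrent_batches` should track the pipeline-parallel size (as the commented-out expression in the property suggests): with PP=2 and two stages of roughly 10 ms each, a single in-flight batch completes every ~20 ms, because each stage idles while the other works. Keeping two batches in flight lets stage 1 run batch N+1 while stage 2 finishes batch N, so a batch completes every ~10 ms: roughly double the throughput at the same per-batch latency.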