[Core] Support async scheduling with uniproc executor (#24219)

Signed-off-by: Nick Hill <nhill@redhat.com>
Signed-off-by: Ronald1995 <ronaldautomobile@163.com>
Co-authored-by: Ronald1995 <ronaldautomobile@163.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Nick Hill
2025-09-12 16:34:28 -07:00
committed by GitHub
parent 8226dd56bf
commit 4fdd6f5cbf
9 changed files with 103 additions and 55 deletions

vllm/v1/executor/abstract.py

@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from concurrent.futures import Future
-from typing import Callable, Optional, Union
+from typing import Any, Callable, Optional, Union
 
 import torch
 import torch.distributed as dist
 
@@ -14,6 +14,7 @@ from vllm.executor.uniproc_executor import (  # noqa
 from vllm.executor.uniproc_executor import (  # noqa
     UniProcExecutor as UniProcExecutorV0)
 from vllm.utils import resolve_obj_by_qualname
+from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput
@@ -86,12 +87,22 @@ class Executor(ExecutorBase):
     def get_kv_cache_specs(self) -> list[dict[str, KVCacheSpec]]:
         return self.collective_rpc("get_kv_cache_spec")
 
+    def collective_rpc(self,
+                       method: Union[str, Callable],
+                       timeout: Optional[float] = None,
+                       args: tuple = (),
+                       kwargs: Optional[dict] = None,
+                       non_block: bool = False) -> list[Any]:
+        raise NotImplementedError
+
     def execute_model(
         self,
-        scheduler_output,
+        scheduler_output: SchedulerOutput,
+        non_block: bool = False,
     ) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]:
         output = self.collective_rpc("execute_model",
-                                     args=(scheduler_output, ))
+                                     args=(scheduler_output, ),
+                                     non_block=non_block)
         return output[0]
 
     def execute_dummy_batch(self) -> None:
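
The key API change above is that `collective_rpc` gains a `non_block` flag and, when it is set, returns Futures instead of results; that is what lets `execute_model` return either a `ModelRunnerOutput` or a `Future[ModelRunnerOutput]`. Below is a minimal sketch of how a single-process executor might honor the flag. The class and attribute names (`SingleProcessExecutorSketch`, `_worker`, `_io_pool`) are hypothetical, not vLLM's actual implementation:

from concurrent.futures import ThreadPoolExecutor
from functools import partial
from typing import Any, Callable, Optional, Union

class SingleProcessExecutorSketch:
    """Illustrative only; not how vLLM's UniProcExecutor is written."""

    def __init__(self, worker: Any):
        self._worker = worker
        # A single thread preserves submission order, which matters when
        # async scheduling keeps two batches in flight at once.
        self._io_pool = ThreadPoolExecutor(max_workers=1)

    def collective_rpc(self,
                       method: Union[str, Callable],
                       timeout: Optional[float] = None,
                       args: tuple = (),
                       kwargs: Optional[dict] = None,
                       non_block: bool = False) -> list[Any]:
        kwargs = kwargs or {}
        # Assumption: string methods resolve to worker attributes, and
        # callables are invoked with the worker as their first argument.
        if isinstance(method, str):
            fn = getattr(self._worker, method)
        else:
            fn = partial(method, self._worker)
        if not non_block:
            # Blocking path: run inline and return results directly.
            return [fn(*args, **kwargs)]
        # Non-blocking path: return a list of Futures; callers such as
        # execute_model() pass the first element through unchanged.
        return [self._io_pool.submit(fn, *args, **kwargs)]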

vllm/v1/executor/multiproc_executor.py

@@ -11,7 +11,7 @@ import weakref
 from concurrent.futures import Future, ThreadPoolExecutor
 from dataclasses import dataclass
 from enum import Enum, auto
-from functools import partial
+from functools import cached_property, partial
 from multiprocessing.connection import Connection
 from multiprocessing.process import BaseProcess
 from multiprocessing.synchronize import Lock as LockType
@@ -37,6 +37,7 @@ from vllm.multimodal.cache import worker_receiver_cache_from_config
 from vllm.utils import (decorate_logs, get_distributed_init_method,
                         get_loopback_ip, get_mp_context, get_open_port,
                         set_process_title)
+from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.executor.abstract import Executor, FailureCallback
 from vllm.v1.executor.utils import get_and_update_mm_cache
 from vllm.v1.outputs import (AsyncModelRunnerOutput, DraftTokenIds,
@@ -174,9 +175,9 @@ class MultiprocExecutor(Executor):
     def execute_model(
         self,
-        scheduler_output,
+        scheduler_output: SchedulerOutput,
+        non_block: bool = False,
     ) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]:
-        non_block = self.max_concurrent_batches > 1
 
         if not self.has_connector:
             # get output only from a single worker (output_rank)
@@ -328,7 +329,7 @@ class MultiprocExecutor(Executor):
         self.collective_rpc("check_health", timeout=10)
         return
 
-    @property
+    @cached_property
     def max_concurrent_batches(self) -> int:
         if self.scheduler_config.async_scheduling:
             return 2
@@ -632,7 +633,8 @@ class WorkerProc:
                 result = (WorkerProc.ResponseStatus.FAILURE, str(output))
             else:
                 result = (WorkerProc.ResponseStatus.SUCCESS, output)
-            self.worker_response_mq.enqueue(result)
+            if (response_mq := self.worker_response_mq) is not None:
+                response_mq.enqueue(result)
 
     def handle_output(self, output: Any):
         """Handles output from the worker. If async scheduling is enabled,

vllm/v1/executor/ray_distributed_executor.py

@@ -66,11 +66,13 @@ class RayDistributedExecutor(RayDistributedExecutorV0, Executor):
     def execute_model(
         self,
         scheduler_output: SchedulerOutput,
+        non_block: bool = False,
     ) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]:
         """Execute the model on the Ray workers.
 
         Args:
             scheduler_output: The scheduler output to execute.
+            non_block: If True, the method will return a Future.
 
         Returns:
             The model runner output.
@@ -84,7 +86,7 @@ class RayDistributedExecutor(RayDistributedExecutorV0, Executor):
         if not self.has_connector:
             # Get output only from a single worker (output_rank)
             # When PP is not used, we block here until the result is available.
-            if self.max_concurrent_batches == 1:
+            if not non_block:
                 return refs[0].get()
 
             # When PP is used, we return a FutureWrapper immediately so that
@@ -92,7 +94,7 @@ class RayDistributedExecutor(RayDistributedExecutorV0, Executor):
             return FutureWrapper(refs)
 
         # Get output from all workers when connector is present
-        if self.max_concurrent_batches == 1:
+        if not non_block:
             # Block and get results from all workers
             outputs = [ref.get() for ref in refs]
             return self.kv_output_aggregator.aggregate(outputs)
@@ -106,4 +108,3 @@ class RayDistributedExecutor(RayDistributedExecutorV0, Executor):
         if reconfig_request.new_data_parallel_rank == \
                 ReconfigureRankType.SHUTDOWN_CURRENT_RANK:
             self.shutdown()
-            return
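
With `non_block` threaded through all three executors, blocking becomes the caller's decision rather than something each executor infers from `max_concurrent_batches`. A hypothetical driver step showing how a caller could overlap scheduling of the next batch with execution of the current one; the `executor`/`scheduler` wiring and the `step` helper are assumptions for illustration, not vLLM's engine loop:

from concurrent.futures import Future

def step(executor, scheduler, scheduler_output):
    # Kick off execution without blocking; a Future comes back.
    future: Future = executor.execute_model(scheduler_output, non_block=True)
    # Overlap: schedule the next batch while the current one runs.
    next_scheduler_output = scheduler.schedule()
    # Block only when the result is actually needed to update state.
    return future.result(), next_scheduler_output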