[Core] Support async scheduling with uniproc executor (#24219)
Signed-off-by: Nick Hill <nhill@redhat.com>
Signed-off-by: Ronald1995 <ronaldautomobile@163.com>
Co-authored-by: Ronald1995 <ronaldautomobile@163.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
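This change threads a non_block flag through the V1 executor interface so that, when async scheduling is enabled, the engine can hand the executor the next batch while the previous one is still executing, and the single-process (uniproc) executor can take part in that overlap just like the multiprocess and Ray executors. As orientation only, the sketch below shows one way a single-process executor could honor non_block by pushing the RPC onto a one-thread pool and returning a Future; the class and attribute names are hypothetical and this is not the implementation in this commit.

# Hedged sketch, not this commit's implementation: a uniproc-style executor
# returning a Future from collective_rpc when non_block=True.
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Callable, Optional, Union


class UniProcLikeExecutor:

    def __init__(self, driver_worker) -> None:
        # driver_worker is assumed to be the in-process worker object.
        self.driver_worker = driver_worker
        # A single thread keeps RPCs ordered while the caller keeps going.
        self._rpc_pool = ThreadPoolExecutor(max_workers=1)

    def collective_rpc(self,
                       method: Union[str, Callable],
                       timeout: Optional[float] = None,
                       args: tuple = (),
                       kwargs: Optional[dict] = None,
                       non_block: bool = False) -> list[Any]:
        # timeout is accepted for interface parity but unused in this sketch.
        func = getattr(self.driver_worker, method) if isinstance(
            method, str) else method
        if not non_block:
            # Blocking path: run inline; the single-element list matches the
            # one-worker shape of the executor API.
            return [func(*args, **(kwargs or {}))]
        # Non-blocking path: the list element is a Future, not the value.
        return [self._rpc_pool.submit(func, *args, **(kwargs or {}))]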
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from concurrent.futures import Future
-from typing import Callable, Optional, Union
+from typing import Any, Callable, Optional, Union
 
 import torch
 import torch.distributed as dist
@@ -14,6 +14,7 @@ from vllm.executor.uniproc_executor import ( # noqa
 from vllm.executor.uniproc_executor import (  # noqa
     UniProcExecutor as UniProcExecutorV0)
 from vllm.utils import resolve_obj_by_qualname
+from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput
@@ -86,12 +87,22 @@ class Executor(ExecutorBase):
     def get_kv_cache_specs(self) -> list[dict[str, KVCacheSpec]]:
         return self.collective_rpc("get_kv_cache_spec")
 
+    def collective_rpc(self,
+                       method: Union[str, Callable],
+                       timeout: Optional[float] = None,
+                       args: tuple = (),
+                       kwargs: Optional[dict] = None,
+                       non_block: bool = False) -> list[Any]:
+        raise NotImplementedError
+
     def execute_model(
         self,
-        scheduler_output,
+        scheduler_output: SchedulerOutput,
+        non_block: bool = False,
     ) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]:
         output = self.collective_rpc("execute_model",
-                                     args=(scheduler_output, ))
+                                     args=(scheduler_output, ),
+                                     non_block=non_block)
         return output[0]
 
     def execute_dummy_batch(self) -> None:
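With the widened signature above, Executor.execute_model returns either a ModelRunnerOutput directly or a Future[ModelRunnerOutput] when non_block=True, so callers have to be prepared for both shapes. The fragment below is an illustrative caller only; scheduler and executor stand in for the real engine objects.

# Illustrative caller; `scheduler` and `executor` are assumed objects with
# schedule() and execute_model() methods as used in this diff.
from concurrent.futures import Future


def run_one_step(scheduler, executor, non_block: bool):
    scheduler_output = scheduler.schedule()
    result = executor.execute_model(scheduler_output, non_block=non_block)
    if isinstance(result, Future):
        # Async-scheduling path: the caller could schedule the next batch
        # here before blocking on the in-flight one.
        return result.result()
    # Blocking path: the ModelRunnerOutput is already materialized.
    return result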
@@ -11,7 +11,7 @@ import weakref
 from concurrent.futures import Future, ThreadPoolExecutor
 from dataclasses import dataclass
 from enum import Enum, auto
-from functools import partial
+from functools import cached_property, partial
 from multiprocessing.connection import Connection
 from multiprocessing.process import BaseProcess
 from multiprocessing.synchronize import Lock as LockType
@@ -37,6 +37,7 @@ from vllm.multimodal.cache import worker_receiver_cache_from_config
 from vllm.utils import (decorate_logs, get_distributed_init_method,
                         get_loopback_ip, get_mp_context, get_open_port,
                         set_process_title)
+from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.executor.abstract import Executor, FailureCallback
 from vllm.v1.executor.utils import get_and_update_mm_cache
 from vllm.v1.outputs import (AsyncModelRunnerOutput, DraftTokenIds,
@@ -174,9 +175,9 @@ class MultiprocExecutor(Executor):
 
     def execute_model(
         self,
-        scheduler_output,
+        scheduler_output: SchedulerOutput,
+        non_block: bool = False,
     ) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]:
-        non_block = self.max_concurrent_batches > 1
 
         if not self.has_connector:
             # get output only from a single worker (output_rank)
@@ -328,7 +329,7 @@ class MultiprocExecutor(Executor):
         self.collective_rpc("check_health", timeout=10)
         return
 
-    @property
+    @cached_property
     def max_concurrent_batches(self) -> int:
         if self.scheduler_config.async_scheduling:
             return 2
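Switching max_concurrent_batches to @cached_property reflects that the value is fixed by the config for the executor's lifetime: 2 under async scheduling, so one batch can execute while the next is being scheduled. A caller can use it to bound how many futures it keeps in flight; the loop below is a hypothetical illustration of that bound, not engine code from this commit.

# Hypothetical illustration of bounding in-flight batches by
# executor.max_concurrent_batches; `scheduler`/`executor` are assumed objects.
from collections import deque


def run_steps(scheduler, executor, num_steps: int):
    in_flight = deque()  # futures for batches already submitted
    for _ in range(num_steps):
        if len(in_flight) >= executor.max_concurrent_batches:
            # Wait for the oldest batch before submitting another one.
            in_flight.popleft().result()
        scheduler_output = scheduler.schedule()
        in_flight.append(
            executor.execute_model(scheduler_output, non_block=True))
    while in_flight:
        in_flight.popleft().result()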
@@ -632,7 +633,8 @@ class WorkerProc:
             result = (WorkerProc.ResponseStatus.FAILURE, str(output))
         else:
             result = (WorkerProc.ResponseStatus.SUCCESS, output)
-        self.worker_response_mq.enqueue(result)
+        if (response_mq := self.worker_response_mq) is not None:
+            response_mq.enqueue(result)
 
     def handle_output(self, output: Any):
         """Handles output from the worker. If async scheduling is enabled,
@@ -66,11 +66,13 @@ class RayDistributedExecutor(RayDistributedExecutorV0, Executor):
     def execute_model(
         self,
         scheduler_output: SchedulerOutput,
+        non_block: bool = False,
     ) -> Union[ModelRunnerOutput, Future[ModelRunnerOutput]]:
         """Execute the model on the Ray workers.
 
         Args:
             scheduler_output: The scheduler output to execute.
+            non_block: If True, the method will return a Future.
 
         Returns:
             The model runner output.
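The docstring addition spells out the contract: with non_block=True the Ray executor hands back a Future rather than a finished ModelRunnerOutput, and the hunks below return FutureWrapper(refs) for that case. Purely as an illustration of the idea (the class below is hypothetical and not vLLM's FutureWrapper), such a wrapper can defer resolving the worker refs until result() is called.

# Illustration only; RefFuture is hypothetical, not vLLM's FutureWrapper.
class RefFuture:

    def __init__(self, refs, aggregate=None):
        self.refs = refs            # worker output refs, each exposing .get()
        self.aggregate = aggregate  # optional combiner, e.g. a KV aggregator

    def result(self):
        # Resolve lazily: block on the refs only when the caller asks.
        outputs = [ref.get() for ref in self.refs]
        if self.aggregate is not None:
            return self.aggregate(outputs)
        return outputs[0]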
@@ -84,7 +86,7 @@ class RayDistributedExecutor(RayDistributedExecutorV0, Executor):
         if not self.has_connector:
             # Get output only from a single worker (output_rank)
             # When PP is not used, we block here until the result is available.
-            if self.max_concurrent_batches == 1:
+            if not non_block:
                 return refs[0].get()
 
             # When PP is used, we return a FutureWrapper immediately so that
@@ -92,7 +94,7 @@ class RayDistributedExecutor(RayDistributedExecutorV0, Executor):
             return FutureWrapper(refs)
 
         # Get output from all workers when connector is present
-        if self.max_concurrent_batches == 1:
+        if not non_block:
             # Block and get results from all workers
             outputs = [ref.get() for ref in refs]
             return self.kv_output_aggregator.aggregate(outputs)
@@ -106,4 +108,3 @@ class RayDistributedExecutor(RayDistributedExecutorV0, Executor):
         if reconfig_request.new_data_parallel_rank == \
                 ReconfigureRankType.SHUTDOWN_CURRENT_RANK:
             self.shutdown()
-        return