[Core] Async scheduling + structured outputs compatibility (#26866)
Signed-off-by: Nick Hill <nhill@redhat.com>
This commit is contained in:
@@ -46,7 +46,7 @@ from vllm.utils.system_utils import (
|
||||
get_mp_context,
|
||||
set_process_title,
|
||||
)
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
|
||||
from vllm.v1.executor.abstract import Executor, FailureCallback
|
||||
from vllm.v1.outputs import AsyncModelRunnerOutput, DraftTokenIds, ModelRunnerOutput
|
||||
from vllm.v1.worker.worker_base import WorkerWrapperBase
|
||||
@@ -132,15 +132,12 @@ class MultiprocExecutor(Executor):
|
||||
uw.death_writer.close()
|
||||
self._ensure_worker_termination([uw.proc for uw in unready_workers])
|
||||
|
||||
# For pipeline parallel, we use a thread pool for asynchronous
|
||||
# execute_model.
|
||||
if self.max_concurrent_batches > 1:
|
||||
# Note: must use only 1 IO thread to keep dequeue sequence
|
||||
# from the response queue
|
||||
# _async_aggregate_workers_output also assumes a single IO thread
|
||||
self.io_thread_pool = ThreadPoolExecutor(
|
||||
max_workers=1, thread_name_prefix="mp_exec_io"
|
||||
)
|
||||
# Note: must use only 1 IO thread to keep dequeue sequence
|
||||
# from the response queue.
|
||||
# _async_aggregate_workers_output also assumes a single IO thread.
|
||||
self.io_thread_pool = ThreadPoolExecutor(
|
||||
max_workers=1, thread_name_prefix="mp_exec_io"
|
||||
)
|
||||
|
||||
self.output_rank = self._get_output_rank()
|
||||
self.has_connector = self.vllm_config.kv_transfer_config is not None
|
||||
@@ -180,15 +177,27 @@ class MultiprocExecutor(Executor):
|
||||
self.failure_callback = callback
|
||||
|
||||
def execute_model( # type: ignore[override]
|
||||
self,
|
||||
scheduler_output: SchedulerOutput,
|
||||
non_block: bool = False,
|
||||
self, scheduler_output: SchedulerOutput, non_block: bool = False
|
||||
) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
|
||||
return self._execute_with_aggregation(
|
||||
"execute_model", scheduler_output, non_block=non_block
|
||||
)
|
||||
|
||||
def sample_tokens( # type: ignore[override]
|
||||
self, grammar_output: GrammarOutput | None, non_block: bool = False
|
||||
) -> ModelRunnerOutput | Future[ModelRunnerOutput]:
|
||||
return self._execute_with_aggregation( # type: ignore[return-value]
|
||||
"sample_tokens", grammar_output, non_block=non_block
|
||||
)
|
||||
|
||||
def _execute_with_aggregation(
|
||||
self, method: str, *args, non_block: bool = False
|
||||
) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
|
||||
if not self.has_connector:
|
||||
# get output only from a single worker (output_rank)
|
||||
(output,) = self.collective_rpc(
|
||||
"execute_model",
|
||||
args=(scheduler_output,),
|
||||
method,
|
||||
args=args,
|
||||
unique_reply_rank=self.output_rank,
|
||||
non_block=non_block,
|
||||
timeout=envs.VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS,
|
||||
@@ -197,8 +206,8 @@ class MultiprocExecutor(Executor):
|
||||
|
||||
# get output from all workers
|
||||
outputs = self.collective_rpc(
|
||||
"execute_model",
|
||||
args=(scheduler_output,),
|
||||
method,
|
||||
args=args,
|
||||
non_block=non_block,
|
||||
timeout=envs.VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user