[Core] Async scheduling + structured outputs compatibility (#26866)

Signed-off-by: Nick Hill <nhill@redhat.com>
2025-10-31 17:35:04 -07:00
parent df334868ca
commit 0cdbe7b744
25 changed files with 419 additions and 191 deletions
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -46,7 +46,7 @@ from vllm.utils.system_utils import (
    get_mp_context,
    set_process_title,
 )
-from vllm.v1.core.sched.output import SchedulerOutput
+from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput
 from vllm.v1.executor.abstract import Executor, FailureCallback
 from vllm.v1.outputs import AsyncModelRunnerOutput, DraftTokenIds, ModelRunnerOutput
 from vllm.v1.worker.worker_base import WorkerWrapperBase
@@ -132,15 +132,12 @@ class MultiprocExecutor(Executor):
                        uw.death_writer.close()
                self._ensure_worker_termination([uw.proc for uw in unready_workers])

-        # For pipeline parallel, we use a thread pool for asynchronous
-        # execute_model.
-        if self.max_concurrent_batches > 1:
-            # Note: must use only 1 IO thread to keep dequeue sequence
-            # from the response queue
-            # _async_aggregate_workers_output also assumes a single IO thread
-            self.io_thread_pool = ThreadPoolExecutor(
-                max_workers=1, thread_name_prefix="mp_exec_io"
-            )
+        # Note: must use only 1 IO thread to preserve the dequeue order
+        # from the response queue.
+        # _async_aggregate_workers_output also assumes a single IO thread.
+        self.io_thread_pool = ThreadPoolExecutor(
+            max_workers=1, thread_name_prefix="mp_exec_io"
+        )

        self.output_rank = self._get_output_rank()
        self.has_connector = self.vllm_config.kv_transfer_config is not None
@@ -180,15 +177,27 @@ class MultiprocExecutor(Executor):
            self.failure_callback = callback

    def execute_model(  # type: ignore[override]
-        self,
-        scheduler_output: SchedulerOutput,
-        non_block: bool = False,
+        self, scheduler_output: SchedulerOutput, non_block: bool = False
+    ) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
+        return self._execute_with_aggregation(
+            "execute_model", scheduler_output, non_block=non_block
+        )
+
+    def sample_tokens(  # type: ignore[override]
+        self, grammar_output: GrammarOutput | None, non_block: bool = False
    ) -> ModelRunnerOutput | Future[ModelRunnerOutput]:
+        return self._execute_with_aggregation(  # type: ignore[return-value]
+            "sample_tokens", grammar_output, non_block=non_block
+        )
+
+    def _execute_with_aggregation(
+        self, method: str, *args, non_block: bool = False
+    ) -> ModelRunnerOutput | None | Future[ModelRunnerOutput | None]:
        if not self.has_connector:
            # get output only from a single worker (output_rank)
            (output,) = self.collective_rpc(
-                "execute_model",
-                args=(scheduler_output,),
+                method,
+                args=args,
                unique_reply_rank=self.output_rank,
                non_block=non_block,
                timeout=envs.VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS,
@@ -197,8 +206,8 @@ class MultiprocExecutor(Executor):

        # get output from all workers
        outputs = self.collective_rpc(
-            "execute_model",
-            args=(scheduler_output,),
+            method,
+            args=args,
            non_block=non_block,
            timeout=envs.VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS,
        )