[Core] Support async scheduling with uniproc executor (#24219)

Signed-off-by: Nick Hill <nhill@redhat.com>
Signed-off-by: Ronald1995 <ronaldautomobile@163.com>
Co-authored-by: Ronald1995 <ronaldautomobile@163.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2025-09-12 16:34:28 -07:00
parent 8226dd56bf
commit 4fdd6f5cbf
9 changed files with 103 additions and 55 deletions
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -159,6 +159,9 @@ class EngineCore:
            self.request_block_hasher = get_request_block_hasher(
                block_size, caching_hash_fn)

+        self.step_fn = (self.step if self.batch_queue is None else
+                        self.step_with_batch_queue)
+
    def _initialize_kv_caches(
            self, vllm_config: VllmConfig) -> tuple[int, int, KVCacheConfig]:
        start = time.time()
@@ -331,7 +334,8 @@ class EngineCore:
        model_executed = False
        if self.scheduler.has_requests():
            scheduler_output = self.scheduler.schedule()
-            future = self.model_executor.execute_model(scheduler_output)
+            future = self.model_executor.execute_model(scheduler_output,
+                                                       non_block=True)
            batch_queue.appendleft(
                (future, scheduler_output))  # type: ignore[arg-type]

@@ -534,9 +538,6 @@ class EngineCoreProc(EngineCore):
                assert addresses.coordinator_input is not None
                logger.info("Waiting for READY message from DP Coordinator...")

-        self.step_fn = (self.step if self.batch_queue is None else
-                        self.step_with_batch_queue)
-
        # Mark the startup heap as static so that it's ignored by GC.
        # Reduces pause times of oldest generation collections.
        gc.collect()