[BugFix] Async scheduling and PP compatibility with DP (#23770)

Signed-off-by: Nick Hill <nhill@redhat.com>
This commit is contained in:
Nick Hill
2025-08-29 08:17:27 -07:00
committed by GitHub
parent 0a2f4c0793
commit d90d8eb674
7 changed files with 105 additions and 98 deletions

View File

@@ -191,6 +191,10 @@ class MultiprocExecutor(Executor):
outputs, self.output_rank)
return self.kv_output_aggregator.aggregate(outputs, self.output_rank)
# Issue an "execute_dummy_batch" call through self.collective_rpc.
# Whatever collective_rpc returns is discarded; this method returns None.
# NOTE(review): presumably this makes the workers run a dummy batch (the
# commit title points at data-parallel compatibility) -- confirm against
# the worker-side "execute_dummy_batch" implementation.
def execute_dummy_batch(self) -> None:
# unique_reply_rank is set to self.output_rank; presumably this limits the
# reply to that single rank -- verify against collective_rpc.
self.collective_rpc("execute_dummy_batch",
unique_reply_rank=self.output_rank)
def take_draft_token_ids(self) -> Optional[DraftTokenIds]:
# OPTIMIZATION: Get output only from a single worker (output_rank)
outputs = self.collective_rpc("take_draft_token_ids",
@@ -242,12 +246,17 @@ class MultiprocExecutor(Executor):
dequeue_timeout = None if deadline is None else (
deadline - time.monotonic())
if non_block:
if self.io_thread_pool is not None:
# We must consume worker_response_mq from a single thread.
result = self.io_thread_pool.submit( # type: ignore
get_response, w, dequeue_timeout, self.shutdown_event)
else:
if not non_block:
result = result.result()
elif not non_block:
result = get_response(w, dequeue_timeout)
else:
raise RuntimeError("non_block can only be used when"
" max_concurrent_batches > 1")
responses.append(result)
return responses