Implement Async Scheduling (#19970)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
This commit is contained in:
Woosuk Kwon
2025-07-14 23:01:46 -07:00
committed by GitHub
parent 85bd6599e4
commit d4d309409f
11 changed files with 508 additions and 148 deletions

View File

@@ -2308,6 +2308,13 @@ class SchedulerConfig:
like full attention and sliding window attention.
"""
async_scheduling: bool = False
"""EXPERIMENTAL: If set to True, perform async scheduling. This may help
reduce CPU overhead, leading to better latency and throughput. However,
async scheduling is currently not supported with some features, such as
structured outputs, speculative decoding, and pipeline parallelism.
"""
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
@@ -2401,6 +2408,10 @@ class SchedulerConfig:
if not self.cuda_graph_sizes:
self.cuda_graph_sizes = [min(self.max_num_seqs * 2, 512)]
if self.async_scheduling:
self.scheduler_cls = (
"vllm.v1.core.sched.async_scheduler.AsyncScheduler")
@model_validator(mode='after')
def _verify_args(self) -> Self:
if (self.max_num_batched_tokens < self.max_model_len