[Feature] Support async scheduling + PP (#32359)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
This commit is contained in:
@@ -9,6 +9,7 @@ from vllm.config import (
|
||||
ECTransferConfig,
|
||||
KVTransferConfig,
|
||||
ModelConfig,
|
||||
ParallelConfig,
|
||||
SchedulerConfig,
|
||||
SpeculativeConfig,
|
||||
VllmConfig,
|
||||
@@ -53,6 +54,7 @@ def create_scheduler(
|
||||
num_speculative_tokens: int | None = None,
|
||||
skip_tokenizer_init: bool = False,
|
||||
async_scheduling: bool = False,
|
||||
pipeline_parallel_size: int = 1,
|
||||
use_ec_connector: bool = False,
|
||||
ec_role: str | None = None,
|
||||
) -> Scheduler | AsyncScheduler:
|
||||
@@ -133,6 +135,7 @@ def create_scheduler(
|
||||
scheduler_config=scheduler_config,
|
||||
model_config=model_config,
|
||||
cache_config=cache_config,
|
||||
parallel_config=ParallelConfig(pipeline_parallel_size=pipeline_parallel_size),
|
||||
kv_transfer_config=kv_transfer_config,
|
||||
speculative_config=speculative_config,
|
||||
ec_transfer_config=ec_transfer_config,
|
||||
|
||||
@@ -563,11 +563,6 @@ class VllmConfig:
|
||||
|
||||
if self.scheduler_config.async_scheduling:
|
||||
# Async scheduling explicitly enabled, hard fail any incompatibilities.
|
||||
if self.parallel_config.pipeline_parallel_size > 1:
|
||||
raise ValueError(
|
||||
"Async scheduling is not yet compatible with "
|
||||
"pipeline_parallel_size > 1."
|
||||
)
|
||||
# Currently, async scheduling only support eagle speculative
|
||||
# decoding.
|
||||
if self.speculative_config is not None:
|
||||
@@ -589,14 +584,7 @@ class VllmConfig:
|
||||
)
|
||||
elif self.scheduler_config.async_scheduling is None:
|
||||
# Enable async scheduling unless there is an incompatible option.
|
||||
if self.parallel_config.pipeline_parallel_size > 1:
|
||||
logger.warning_once(
|
||||
"Async scheduling is not yet supported with "
|
||||
"pipeline_parallel_size > 1 and will be disabled.",
|
||||
scope="local",
|
||||
)
|
||||
self.scheduler_config.async_scheduling = False
|
||||
elif (
|
||||
if (
|
||||
self.speculative_config is not None
|
||||
and self.speculative_config.method not in get_args(EagleModelTypes)
|
||||
):
|
||||
|
||||
@@ -283,6 +283,13 @@ class Scheduler(SchedulerInterface):
|
||||
while req_index < len(self.running) and token_budget > 0:
|
||||
request = self.running[req_index]
|
||||
|
||||
# do not schedule another step for the same request while it still has
|
||||
# output placeholders for PP.
|
||||
# TODO: support PP + async scheduling without this limit
|
||||
if self.use_pp and request.num_output_placeholders > 0:
|
||||
req_index += 1
|
||||
continue
|
||||
|
||||
if (
|
||||
request.num_output_placeholders > 0
|
||||
# This is (num_computed_tokens + 1) - (num_output_placeholders - 1).
|
||||
|
||||
@@ -411,9 +411,9 @@ class MultiprocExecutor(Executor):
|
||||
|
||||
@cached_property
|
||||
def max_concurrent_batches(self) -> int:
|
||||
if self.scheduler_config.async_scheduling:
|
||||
return 2
|
||||
return self.parallel_config.pipeline_parallel_size
|
||||
# PP requires PP-size concurrent batches to fill the pipeline.
|
||||
pp_size = self.parallel_config.pipeline_parallel_size
|
||||
return 2 if pp_size <= 1 and self.scheduler_config.async_scheduling else pp_size
|
||||
|
||||
def _get_output_rank(self) -> int:
|
||||
# Only returns ModelRunnerOutput from TP rank=0 and PP rank=-1
|
||||
|
||||
@@ -111,9 +111,8 @@ class RayDistributedExecutor(Executor):
|
||||
"""Ray distributed executor supports pipeline parallelism,
|
||||
meaning that it allows PP size batches to be executed concurrently.
|
||||
"""
|
||||
if self.scheduler_config.async_scheduling:
|
||||
return 2
|
||||
return self.parallel_config.pipeline_parallel_size
|
||||
pp_size = self.parallel_config.pipeline_parallel_size
|
||||
return 2 if pp_size <= 1 and self.scheduler_config.async_scheduling else pp_size
|
||||
|
||||
def shutdown(self) -> None:
|
||||
if logger:
|
||||
|
||||
Reference in New Issue
Block a user