[Feature] Support async scheduling + PP (#32359)

Signed-off-by: yewentao256 <zhyanwentao@126.com>
2026-01-15 12:06:23 -05:00
parent 6218034dd7
commit b34474bf2c
5 changed files with 16 additions and 19 deletions
--- a/tests/v1/core/utils.py
+++ b/tests/v1/core/utils.py
@@ -9,6 +9,7 @@ from vllm.config import (
    ECTransferConfig,
    KVTransferConfig,
    ModelConfig,
+    ParallelConfig,
    SchedulerConfig,
    SpeculativeConfig,
    VllmConfig,
@@ -53,6 +54,7 @@ def create_scheduler(
    num_speculative_tokens: int | None = None,
    skip_tokenizer_init: bool = False,
    async_scheduling: bool = False,
+    pipeline_parallel_size: int = 1,
    use_ec_connector: bool = False,
    ec_role: str | None = None,
 ) -> Scheduler | AsyncScheduler:
@@ -133,6 +135,7 @@ def create_scheduler(
        scheduler_config=scheduler_config,
        model_config=model_config,
        cache_config=cache_config,
+        parallel_config=ParallelConfig(pipeline_parallel_size=pipeline_parallel_size),
        kv_transfer_config=kv_transfer_config,
        speculative_config=speculative_config,
        ec_transfer_config=ec_transfer_config,
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -563,11 +563,6 @@ class VllmConfig:

        if self.scheduler_config.async_scheduling:
            # Async scheduling explicitly enabled, hard fail any incompatibilities.
-            if self.parallel_config.pipeline_parallel_size > 1:
-                raise ValueError(
-                    "Async scheduling is not yet compatible with "
-                    "pipeline_parallel_size > 1."
-                )
            # Currently, async scheduling only support eagle speculative
            # decoding.
            if self.speculative_config is not None:
@@ -589,14 +584,7 @@ class VllmConfig:
                )
        elif self.scheduler_config.async_scheduling is None:
            # Enable async scheduling unless there is an incompatible option.
-            if self.parallel_config.pipeline_parallel_size > 1:
-                logger.warning_once(
-                    "Async scheduling is not yet supported with "
-                    "pipeline_parallel_size > 1 and will be disabled.",
-                    scope="local",
-                )
-                self.scheduler_config.async_scheduling = False
-            elif (
+            if (
                self.speculative_config is not None
                and self.speculative_config.method not in get_args(EagleModelTypes)
            ):
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -283,6 +283,13 @@ class Scheduler(SchedulerInterface):
        while req_index < len(self.running) and token_budget > 0:
            request = self.running[req_index]

+            # do not schedule another step for the same request while it still has
+            # output placeholders for PP.
+            # TODO: support PP + async scheduling without this limit
+            if self.use_pp and request.num_output_placeholders > 0:
+                req_index += 1
+                continue
+
            if (
                request.num_output_placeholders > 0
                # This is (num_computed_tokens + 1) - (num_output_placeholders - 1).
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -411,9 +411,9 @@ class MultiprocExecutor(Executor):

    @cached_property
    def max_concurrent_batches(self) -> int:
-        if self.scheduler_config.async_scheduling:
-            return 2
-        return self.parallel_config.pipeline_parallel_size
+        # PP requires PP-size concurrent batches to fill the pipeline.
+        pp_size = self.parallel_config.pipeline_parallel_size
+        return 2 if pp_size <= 1 and self.scheduler_config.async_scheduling else pp_size

    def _get_output_rank(self) -> int:
        # Only returns ModelRunnerOutput from TP rank=0 and PP rank=-1
--- a/vllm/v1/executor/ray_executor.py
+++ b/vllm/v1/executor/ray_executor.py
@@ -111,9 +111,8 @@ class RayDistributedExecutor(Executor):
        """Ray distributed executor supports pipeline parallelism,
        meaning that it allows PP size batches to be executed concurrently.
        """
-        if self.scheduler_config.async_scheduling:
-            return 2
-        return self.parallel_config.pipeline_parallel_size
+        pp_size = self.parallel_config.pipeline_parallel_size
+        return 2 if pp_size <= 1 and self.scheduler_config.async_scheduling else pp_size

    def shutdown(self) -> None:
        if logger: