[V0 Deprecation] Remove multi-step scheduling (#22138)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
This commit is contained in:
Woosuk Kwon
2025-08-12 20:18:39 -07:00
committed by GitHub
parent e18859298d
commit 71683ca6f6
37 changed files with 57 additions and 3465 deletions

View File

@@ -362,8 +362,6 @@ class EngineArgs:
lora_dtype: Optional[Union[str, torch.dtype]] = LoRAConfig.lora_dtype
lora_extra_vocab_size: int = LoRAConfig.lora_extra_vocab_size
num_scheduler_steps: int = SchedulerConfig.num_scheduler_steps
multi_step_stream_outputs: bool = SchedulerConfig.multi_step_stream_outputs
ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight
num_gpu_blocks_override: Optional[
int] = CacheConfig.num_gpu_blocks_override
@@ -799,11 +797,8 @@ class EngineArgs:
**scheduler_kwargs["delay_factor"])
scheduler_group.add_argument("--preemption-mode",
**scheduler_kwargs["preemption_mode"])
scheduler_group.add_argument("--num-scheduler-steps",
**scheduler_kwargs["num_scheduler_steps"])
scheduler_group.add_argument(
"--multi-step-stream-outputs",
**scheduler_kwargs["multi_step_stream_outputs"])
# multi-step scheduling has been removed; corresponding arguments
# are no longer supported.
scheduler_group.add_argument("--scheduling-policy",
**scheduler_kwargs["policy"])
scheduler_group.add_argument(
@@ -1257,28 +1252,11 @@ class EngineArgs:
disable_log_stats=self.disable_log_stats,
)
        # Reminder: Please update docs/features/compatibility_matrix.md
        # if the feature combo becomes valid
if self.num_scheduler_steps > 1:
if speculative_config is not None:
raise ValueError("Speculative decoding is not supported with "
"multi-step (--num-scheduler-steps > 1)")
if self.enable_chunked_prefill and self.pipeline_parallel_size > 1:
raise ValueError("Multi-Step Chunked-Prefill is not supported "
"for pipeline-parallel-size > 1")
if current_platform.is_cpu():
logger.warning("Multi-Step (--num-scheduler-steps > 1) is "
"currently not supported for CPUs and has been "
"disabled.")
self.num_scheduler_steps = 1
# make sure num_lookahead_slots is set the higher value depending on
# if we are using speculative decoding or multi-step
num_lookahead_slots = max(self.num_lookahead_slots,
self.num_scheduler_steps - 1)
num_lookahead_slots = num_lookahead_slots \
if speculative_config is None \
else speculative_config.num_lookahead_slots
# make sure num_lookahead_slots is set appropriately depending on
# whether speculative decoding is enabled
num_lookahead_slots = self.num_lookahead_slots
if speculative_config is not None:
num_lookahead_slots = speculative_config.num_lookahead_slots
scheduler_config = SchedulerConfig(
runner_type=model_config.runner_type,
@@ -1292,8 +1270,6 @@ class EngineArgs:
disable_chunked_mm_input=self.disable_chunked_mm_input,
is_multimodal_model=model_config.is_multimodal_model,
preemption_mode=self.preemption_mode,
num_scheduler_steps=self.num_scheduler_steps,
multi_step_stream_outputs=self.multi_step_stream_outputs,
send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER
and parallel_config.use_ray),
policy=self.scheduling_policy,
@@ -1392,11 +1368,6 @@ class EngineArgs:
recommend_to_remove=True)
return False
if self.num_scheduler_steps != SchedulerConfig.num_scheduler_steps:
_raise_or_fallback(feature_name="--num-scheduler-steps",
recommend_to_remove=True)
return False
if self.scheduler_delay_factor != SchedulerConfig.delay_factor:
_raise_or_fallback(feature_name="--scheduler-delay-factor",
recommend_to_remove=True)