[V0 Deprecation] Remove multi-step scheduling (#22138)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
This commit is contained in:
Woosuk Kwon
2025-08-12 20:18:39 -07:00
committed by GitHub
parent e18859298d
commit 71683ca6f6
37 changed files with 57 additions and 3465 deletions

View File

@@ -362,8 +362,6 @@ class EngineArgs:
lora_dtype: Optional[Union[str, torch.dtype]] = LoRAConfig.lora_dtype
lora_extra_vocab_size: int = LoRAConfig.lora_extra_vocab_size
num_scheduler_steps: int = SchedulerConfig.num_scheduler_steps
multi_step_stream_outputs: bool = SchedulerConfig.multi_step_stream_outputs
ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight
num_gpu_blocks_override: Optional[
int] = CacheConfig.num_gpu_blocks_override
@@ -799,11 +797,8 @@ class EngineArgs:
**scheduler_kwargs["delay_factor"])
scheduler_group.add_argument("--preemption-mode",
**scheduler_kwargs["preemption_mode"])
scheduler_group.add_argument("--num-scheduler-steps",
**scheduler_kwargs["num_scheduler_steps"])
scheduler_group.add_argument(
"--multi-step-stream-outputs",
**scheduler_kwargs["multi_step_stream_outputs"])
# multi-step scheduling has been removed; corresponding arguments
# are no longer supported.
scheduler_group.add_argument("--scheduling-policy",
**scheduler_kwargs["policy"])
scheduler_group.add_argument(
@@ -1257,28 +1252,11 @@ class EngineArgs:
disable_log_stats=self.disable_log_stats,
)
        # Reminder: Please update docs/features/compatibility_matrix.md
        # if the feature combo becomes valid
if self.num_scheduler_steps > 1:
if speculative_config is not None:
raise ValueError("Speculative decoding is not supported with "
"multi-step (--num-scheduler-steps > 1)")
if self.enable_chunked_prefill and self.pipeline_parallel_size > 1:
raise ValueError("Multi-Step Chunked-Prefill is not supported "
"for pipeline-parallel-size > 1")
if current_platform.is_cpu():
logger.warning("Multi-Step (--num-scheduler-steps > 1) is "
"currently not supported for CPUs and has been "
"disabled.")
self.num_scheduler_steps = 1
# make sure num_lookahead_slots is set the higher value depending on
# if we are using speculative decoding or multi-step
num_lookahead_slots = max(self.num_lookahead_slots,
self.num_scheduler_steps - 1)
num_lookahead_slots = num_lookahead_slots \
if speculative_config is None \
else speculative_config.num_lookahead_slots
# make sure num_lookahead_slots is set appropriately depending on
# whether speculative decoding is enabled
num_lookahead_slots = self.num_lookahead_slots
if speculative_config is not None:
num_lookahead_slots = speculative_config.num_lookahead_slots
scheduler_config = SchedulerConfig(
runner_type=model_config.runner_type,
@@ -1292,8 +1270,6 @@ class EngineArgs:
disable_chunked_mm_input=self.disable_chunked_mm_input,
is_multimodal_model=model_config.is_multimodal_model,
preemption_mode=self.preemption_mode,
num_scheduler_steps=self.num_scheduler_steps,
multi_step_stream_outputs=self.multi_step_stream_outputs,
send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER
and parallel_config.use_ray),
policy=self.scheduling_policy,
@@ -1392,11 +1368,6 @@ class EngineArgs:
recommend_to_remove=True)
return False
if self.num_scheduler_steps != SchedulerConfig.num_scheduler_steps:
_raise_or_fallback(feature_name="--num-scheduler-steps",
recommend_to_remove=True)
return False
if self.scheduler_delay_factor != SchedulerConfig.delay_factor:
_raise_or_fallback(feature_name="--scheduler-delay-factor",
recommend_to_remove=True)