[V0 Deprecation] Remove multi-step scheduling (#22138)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
This commit is contained in:
@@ -362,8 +362,6 @@ class EngineArgs:
|
||||
lora_dtype: Optional[Union[str, torch.dtype]] = LoRAConfig.lora_dtype
|
||||
lora_extra_vocab_size: int = LoRAConfig.lora_extra_vocab_size
|
||||
|
||||
num_scheduler_steps: int = SchedulerConfig.num_scheduler_steps
|
||||
multi_step_stream_outputs: bool = SchedulerConfig.multi_step_stream_outputs
|
||||
ray_workers_use_nsight: bool = ParallelConfig.ray_workers_use_nsight
|
||||
num_gpu_blocks_override: Optional[
|
||||
int] = CacheConfig.num_gpu_blocks_override
|
||||
@@ -799,11 +797,8 @@ class EngineArgs:
|
||||
**scheduler_kwargs["delay_factor"])
|
||||
scheduler_group.add_argument("--preemption-mode",
|
||||
**scheduler_kwargs["preemption_mode"])
|
||||
scheduler_group.add_argument("--num-scheduler-steps",
|
||||
**scheduler_kwargs["num_scheduler_steps"])
|
||||
scheduler_group.add_argument(
|
||||
"--multi-step-stream-outputs",
|
||||
**scheduler_kwargs["multi_step_stream_outputs"])
|
||||
# multi-step scheduling has been removed; corresponding arguments
|
||||
# are no longer supported.
|
||||
scheduler_group.add_argument("--scheduling-policy",
|
||||
**scheduler_kwargs["policy"])
|
||||
scheduler_group.add_argument(
|
||||
@@ -1257,28 +1252,11 @@ class EngineArgs:
|
||||
disable_log_stats=self.disable_log_stats,
|
||||
)
|
||||
|
||||
# Reminder: Please update docs/features/compatibility_matrix.md
|
||||
# if the feature combo becomes valid
|
||||
if self.num_scheduler_steps > 1:
|
||||
if speculative_config is not None:
|
||||
raise ValueError("Speculative decoding is not supported with "
|
||||
"multi-step (--num-scheduler-steps > 1)")
|
||||
if self.enable_chunked_prefill and self.pipeline_parallel_size > 1:
|
||||
raise ValueError("Multi-Step Chunked-Prefill is not supported "
|
||||
"for pipeline-parallel-size > 1")
|
||||
if current_platform.is_cpu():
|
||||
logger.warning("Multi-Step (--num-scheduler-steps > 1) is "
|
||||
"currently not supported for CPUs and has been "
|
||||
"disabled.")
|
||||
self.num_scheduler_steps = 1
|
||||
|
||||
# make sure num_lookahead_slots is set to the higher value depending on
|
||||
# if we are using speculative decoding or multi-step
|
||||
num_lookahead_slots = max(self.num_lookahead_slots,
|
||||
self.num_scheduler_steps - 1)
|
||||
num_lookahead_slots = num_lookahead_slots \
|
||||
if speculative_config is None \
|
||||
else speculative_config.num_lookahead_slots
|
||||
# make sure num_lookahead_slots is set appropriately depending on
|
||||
# whether speculative decoding is enabled
|
||||
num_lookahead_slots = self.num_lookahead_slots
|
||||
if speculative_config is not None:
|
||||
num_lookahead_slots = speculative_config.num_lookahead_slots
|
||||
|
||||
scheduler_config = SchedulerConfig(
|
||||
runner_type=model_config.runner_type,
|
||||
@@ -1292,8 +1270,6 @@ class EngineArgs:
|
||||
disable_chunked_mm_input=self.disable_chunked_mm_input,
|
||||
is_multimodal_model=model_config.is_multimodal_model,
|
||||
preemption_mode=self.preemption_mode,
|
||||
num_scheduler_steps=self.num_scheduler_steps,
|
||||
multi_step_stream_outputs=self.multi_step_stream_outputs,
|
||||
send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER
|
||||
and parallel_config.use_ray),
|
||||
policy=self.scheduling_policy,
|
||||
@@ -1392,11 +1368,6 @@ class EngineArgs:
|
||||
recommend_to_remove=True)
|
||||
return False
|
||||
|
||||
if self.num_scheduler_steps != SchedulerConfig.num_scheduler_steps:
|
||||
_raise_or_fallback(feature_name="--num-scheduler-steps",
|
||||
recommend_to_remove=True)
|
||||
return False
|
||||
|
||||
if self.scheduler_delay_factor != SchedulerConfig.delay_factor:
|
||||
_raise_or_fallback(feature_name="--scheduler-delay-factor",
|
||||
recommend_to_remove=True)
|
||||
|
||||
Reference in New Issue
Block a user