diff --git a/vllm/config/scheduler.py b/vllm/config/scheduler.py index 8abbe8ba0..781d13c69 100644 --- a/vllm/config/scheduler.py +++ b/vllm/config/scheduler.py @@ -130,11 +130,12 @@ class SchedulerConfig: and starting configuration. """ - async_scheduling: bool = False - """If set to True, perform async scheduling. This helps to avoid gaps in - GPU utilization, leading to better latency and throughput. - Async scheduling is currently not supported with some features such as - speculative decoding and pipeline parallelism. + async_scheduling: bool = Field(default=None) + """If set to False, disable async scheduling. Async scheduling helps to + avoid gaps in GPU utilization, leading to better latency and throughput. + It is currently not supported with some features such as + speculative decoding and pipeline parallelism, and will be automatically + disabled in those cases. """ stream_interval: int = Field(default=1, ge=1) diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 70319f98f..8e6cfb826 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -552,7 +552,7 @@ class VllmConfig: if self.speculative_config.method not in get_args(EagleModelTypes): raise ValueError( "Currently, async scheduling is only supported " - "with EAGLE/MTP kind of speculative decoding" + "with EAGLE/MTP kind of speculative decoding." ) if self.speculative_config.disable_padded_drafter_batch: raise ValueError( @@ -570,16 +570,27 @@ class VllmConfig: ) elif self.scheduler_config.async_scheduling is None: # Enable async scheduling unless there is an incompatible option. - # NOTE: we won't reach here until async scheduling is enabled by default. - if ( - self.parallel_config.pipeline_parallel_size > 1 - or self.speculative_config is not None - ): + if self.parallel_config.pipeline_parallel_size > 1: logger.warning( - "Async scheduling is not yet supported with speculative decoding " - " or pipeline_parallel_size > 1 and will be disabled." + "Async scheduling is not yet supported with " + "pipeline_parallel_size > 1 and will be disabled." ) self.scheduler_config.async_scheduling = False + elif self.speculative_config is not None: + if self.speculative_config.method not in get_args(EagleModelTypes): + logger.warning( + "Async scheduling not supported with %s-based " + "speculative decoding and will be disabled.", + self.speculative_config.method, + ) + else: + logger.warning( + "Async scheduling will be disabled because some features do " + "not currently work in conjunction with speculative decoding. " + "To use async scheduling with spec decoding anyway, " + "enable it explicitly via async_scheduling=True." + ) + self.scheduler_config.async_scheduling = False elif not executor_supports_async_sched: logger.warning( "Async scheduling will be disabled because it is not supported " @@ -595,11 +606,16 @@ class VllmConfig: self.scheduler_config.async_scheduling and not self.parallel_config.disable_nccl_for_dp_synchronization ): - logger.info( + logger.info_once( "Disabling NCCL for DP synchronization when using async scheduling." ) self.parallel_config.disable_nccl_for_dp_synchronization = True + logger.info_once( + "Asynchronous scheduling is %s.", + "enabled" if self.scheduler_config.async_scheduling else "disabled", + ) + from vllm.platforms import current_platform if (