[Core] Combined support for multi-step scheduling, chunked prefill & prefix caching (#8804)

Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com> Co-authored-by: Andrew Feldman <afeld2012@gmail.com>
2024-10-02 03:52:20 -04:00
parent 1570203864
commit 563649aafe
3 changed files with 180 additions and 17 deletions
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -999,10 +999,6 @@ class EngineArgs:
            if speculative_config is not None:
                raise ValueError("Speculative decoding is not supported with "
                                 "multi-step (--num-scheduler-steps > 1)")
-            if self.enable_chunked_prefill and self.enable_prefix_caching:
-                raise ValueError("Multi-Step is not supported with "
-                                 "both Chunked-Prefill and Prefix-Caching "
-                                 "enabled together.")
            if self.enable_chunked_prefill and self.pipeline_parallel_size > 1:
                raise ValueError("Multi-Step Chunked-Prefill is not supported "
                                 "for pipeline-parallel-size > 1")