[Core] Multi-Step + Single Step Prefills via Chunked Prefill code path (#8378)

Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2024-09-27 16:32:07 -04:00
parent c5d55356f9
commit c2ec430ab5
19 changed files with 513 additions and 108 deletions
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -980,9 +980,13 @@ class EngineArgs:
            if speculative_config is not None:
                raise ValueError("Speculative decoding is not supported with "
                                 "multi-step (--num-scheduler-steps > 1)")
-            if self.enable_chunked_prefill:
-                raise ValueError("Chunked prefill is not supported with "
-                                 "multi-step (--num-scheduler-steps > 1)")
+            if self.enable_chunked_prefill and self.enable_prefix_caching:
+                raise ValueError("Multi-Step is not supported with "
+                                 "both Chunked-Prefill and Prefix-Caching "
+                                 "enabled together.")
+            if self.enable_chunked_prefill and self.pipeline_parallel_size > 1:
+                raise ValueError("Multi-Step Chunked-Prefill is not supported "
+                                 "for pipeline-parallel-size > 1")

        # make sure num_lookahead_slots is set the higher value depending on
        # if we are using speculative decoding or multi-step