[Core] Multi-Step + Single Step Prefills via Chunked Prefill code path (#8378)

Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
This commit is contained in:
Varun Sundar Rabindranath
2024-09-27 16:32:07 -04:00
committed by GitHub
parent c5d55356f9
commit c2ec430ab5
19 changed files with 513 additions and 108 deletions

View File

@@ -980,9 +980,13 @@ class EngineArgs:
if speculative_config is not None:
raise ValueError("Speculative decoding is not supported with "
"multi-step (--num-scheduler-steps > 1)")
-if self.enable_chunked_prefill:
-raise ValueError("Chunked prefill is not supported with "
-"multi-step (--num-scheduler-steps > 1)")
+if self.enable_chunked_prefill and self.enable_prefix_caching:
+raise ValueError("Multi-Step is not supported with "
+"both Chunked-Prefill and Prefix-Caching "
+"enabled together.")
+if self.enable_chunked_prefill and self.pipeline_parallel_size > 1:
+raise ValueError("Multi-Step Chunked-Prefill is not supported "
+"for pipeline-parallel-size > 1")
# make sure num_lookahead_slots is set the higher value depending on
# if we are using speculative decoding or multi-step