[Core] Multi-Step + Single Step Prefills via Chunked Prefill code path (#8378)

Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Author: Varun Sundar Rabindranath
Date: 2024-09-27 16:32:07 -04:00
Committed by: GitHub
Parent: c5d55356f9
Commit: c2ec430ab5
19 changed files with 513 additions and 108 deletions


@@ -983,9 +983,16 @@ class SchedulerConfig:
                          policy: str = "fcfs") -> None:
         if max_num_batched_tokens is None:
             if enable_chunked_prefill:
-                # It is the values that have the best balance between ITL
-                # and TTFT on A100. Note it is not optimized for throughput.
-                max_num_batched_tokens = 512
+                if num_scheduler_steps > 1:
+                    # Multi-step Chunked-Prefill doesn't allow prompt-chunking
+                    # for now. Have max_num_batched_tokens set to max_model_len
+                    # so we don't reject sequences on account of a short
+                    # max_num_batched_tokens.
+                    max_num_batched_tokens = max(max_model_len, 2048)
+                else:
+                    # It is the values that have the best balance between ITL
+                    # and TTFT on A100. Note it is not optimized for throughput.
+                    max_num_batched_tokens = 512
             else:
                 # If max_model_len is too short, use 2048 as the default value
                 # for higher throughput.
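The default-selection logic this hunk introduces can be sketched as a standalone function. This is a hypothetical helper for illustration, not part of the vLLM API; the non-chunked branch is truncated in the hunk above, so its `max(max_model_len, 2048)` body is an assumption based on the surviving comment.

```python
def default_max_num_batched_tokens(
    enable_chunked_prefill: bool,
    num_scheduler_steps: int,
    max_model_len: int,
) -> int:
    """Sketch of SchedulerConfig's token-budget default after this commit."""
    if enable_chunked_prefill:
        if num_scheduler_steps > 1:
            # Multi-step chunked-prefill can't chunk prompts yet, so the
            # budget must admit a full-length prompt in one batch.
            return max(max_model_len, 2048)
        # 512 balances ITL and TTFT on A100 (not tuned for throughput).
        return 512
    # Non-chunked path (assumed from the truncated comment): if
    # max_model_len is too short, raise the floor to 2048 for throughput.
    return max(max_model_len, 2048)
```

The practical effect: enabling both multi-step scheduling and chunked prefill no longer rejects prompts longer than the 512-token chunked-prefill default.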