[Core] Multi-Step + Single Step Prefills via Chunked Prefill code path (#8378)
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
This commit is contained in:
committed by
GitHub
parent
c5d55356f9
commit
c2ec430ab5
@@ -983,9 +983,16 @@ class SchedulerConfig:
|
||||
policy: str = "fcfs") -> None:
|
||||
if max_num_batched_tokens is None:
|
||||
if enable_chunked_prefill:
|
||||
# This value gives the best balance between ITL
|
||||
# and TTFT on A100. Note it is not optimized for throughput.
|
||||
max_num_batched_tokens = 512
|
||||
if num_scheduler_steps > 1:
|
||||
# Multi-step Chunked-Prefill doesn't allow prompt-chunking
|
||||
# for now. Set max_num_batched_tokens to at least max_model_len
|
||||
# so we don't reject sequences on account of a short
|
||||
# max_num_batched_tokens.
|
||||
max_num_batched_tokens = max(max_model_len, 2048)
|
||||
else:
|
||||
# This value gives the best balance between ITL
|
||||
# and TTFT on A100. Note it is not optimized for throughput.
|
||||
max_num_batched_tokens = 512
|
||||
else:
|
||||
# If max_model_len is too short, use 2048 as the default value
|
||||
# for higher throughput.
|
||||
|
||||
Reference in New Issue
Block a user