[Core] Reduce TTFT with concurrent partial prefills (#10235)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
Signed-off-by: Prashant Gupta <prashantgupta@us.ibm.com>
Co-authored-by: Prashant Gupta <prashantgupta@us.ibm.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
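
A minimal usage sketch of the new knobs, assuming the EngineArgs fields below are also accepted as keyword arguments by the offline LLM entrypoint (the model name and values are illustrative, not recommendations):

# Allow several prompts to be prefilled concurrently so that short prompts
# are not queued behind one very long prompt's prefill, lowering TTFT.
from vllm import LLM

llm = LLM(
    model="facebook/opt-125m",          # illustrative model
    enable_chunked_prefill=True,        # these options apply to chunked prefill
    max_num_partial_prefills=4,         # up to 4 prompts prefill concurrently
    max_long_partial_prefills=2,        # at most 2 of them may be "long"
    long_prefill_token_threshold=2048,  # prompts over 2048 tokens count as long
)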
@@ -120,6 +120,9 @@ class EngineArgs:
     cpu_offload_gb: float = 0  # GiB
     gpu_memory_utilization: float = 0.90
     max_num_batched_tokens: Optional[int] = None
+    max_num_partial_prefills: Optional[int] = 1
+    max_long_partial_prefills: Optional[int] = 1
+    long_prefill_token_threshold: Optional[int] = 0
     max_num_seqs: Optional[int] = None
     max_logprobs: int = 20  # Default value for OpenAI Chat Completions API
     disable_log_stats: bool = False
@@ -515,6 +518,31 @@ class EngineArgs:
             default=EngineArgs.max_num_batched_tokens,
             help='Maximum number of batched tokens per '
             'iteration.')
+        parser.add_argument(
+            "--max-num-partial-prefills",
+            type=int,
+            default=EngineArgs.max_num_partial_prefills,
+            help="For chunked prefill, the max number of concurrent "
+            "partial prefills. "
+            "Defaults to 1.",
+        )
+        parser.add_argument(
+            "--max-long-partial-prefills",
+            type=int,
+            default=EngineArgs.max_long_partial_prefills,
+            help="For chunked prefill, the maximum number of prompts longer "
+            "than --long-prefill-token-threshold that will be prefilled "
+            "concurrently. Setting this less than --max-num-partial-prefills "
+            "will allow shorter prompts to jump the queue in front of longer "
+            "prompts in some cases, improving latency. Defaults to 1.")
+        parser.add_argument(
+            "--long-prefill-token-threshold",
+            type=int,
+            default=EngineArgs.long_prefill_token_threshold,
+            help="For chunked prefill, a request is considered long if the "
+            "prompt is longer than this number of tokens. Defaults to 4%% of "
+            "the model's context length.",
+        )
         parser.add_argument('--max-num-seqs',
                             type=int,
                             default=EngineArgs.max_num_seqs,
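
The queue-jumping behavior described in the --max-long-partial-prefills help text can be illustrated with a toy admission loop (a simplified sketch, not vLLM's scheduler code; the function and values are made up):

def admit_prefills(prompt_lens, max_partial=4, max_long=1, threshold=2048):
    # Admit at most max_partial prompts into the current prefill batch,
    # of which at most max_long may exceed the long-prompt threshold.
    admitted, long_count = [], 0
    for n in prompt_lens:
        if len(admitted) == max_partial:
            break
        if n > threshold:
            if long_count == max_long:
                continue  # long slots full; later short prompts jump ahead
            long_count += 1
        admitted.append(n)
    return admitted

# Two long prompts queued ahead of three short ones: only one long prompt
# is admitted, and the short prompts jump into the remaining slots.
print(admit_prefills([4096, 8192, 100, 200, 300]))  # -> [4096, 100, 200, 300]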
@@ -1244,7 +1272,11 @@ class EngineArgs:
             multi_step_stream_outputs=self.multi_step_stream_outputs,
             send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER
                              and parallel_config.use_ray),
-            policy=self.scheduling_policy)
+            policy=self.scheduling_policy,
+            max_num_partial_prefills=self.max_num_partial_prefills,
+            max_long_partial_prefills=self.max_long_partial_prefills,
+            long_prefill_token_threshold=self.long_prefill_token_threshold,
+        )
         lora_config = LoRAConfig(
             bias_enabled=self.enable_lora_bias,
             max_lora_rank=self.max_lora_rank,
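
The "4% of the model's context length" default mentioned in the help text can be read together with the field default of 0. A plausible resolution rule, sketched here with hypothetical names (this helper is not part of the diff):

def resolve_long_prefill_threshold(threshold: int, max_model_len: int) -> int:
    # Assumption: a threshold of 0 (the field default) means "use 4% of
    # the model's context length"; any explicit value is used as-is.
    if threshold == 0:
        return int(0.04 * max_model_len)
    return threshold

# e.g. for a 32768-token context: int(0.04 * 32768) == 1310
assert resolve_long_prefill_threshold(0, 32768) == 1310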