[Dynamic Spec Decoding] Auto-disable by the running queue size (#4592)

Co-authored-by: Cade Daniel <edacih@gmail.com>
This commit is contained in:
Cody Yu
2024-05-08 14:44:00 -07:00
committed by GitHub
parent 89579a201f
commit f942efb5a3
11 changed files with 227 additions and 39 deletions

View File

@@ -83,6 +83,7 @@ class EngineArgs:
speculative_model: Optional[str] = None
num_speculative_tokens: Optional[int] = None
speculative_max_model_len: Optional[int] = None
speculative_disable_by_batch_size: Optional[int] = None
ngram_prompt_lookup_max: Optional[int] = None
ngram_prompt_lookup_min: Optional[int] = None
@@ -467,6 +468,13 @@ class EngineArgs:
'draft model. Sequences over this length will skip '
'speculation.')
parser.add_argument(
'--speculative-disable-by-batch-size',
type=int,
default=EngineArgs.speculative_disable_by_batch_size,
help='Disable speculative decoding for new incoming requests '
'if the number of enqueued requests is larger than this value.')
parser.add_argument(
'--ngram-prompt-lookup-max',
type=int,
@@ -547,6 +555,8 @@ class EngineArgs:
target_dtype=self.dtype,
speculative_model=self.speculative_model,
num_speculative_tokens=self.num_speculative_tokens,
speculative_disable_by_batch_size=self.
speculative_disable_by_batch_size,
speculative_max_model_len=self.speculative_max_model_len,
enable_chunked_prefill=self.enable_chunked_prefill,
use_v2_block_manager=self.use_v2_block_manager,