[Dynamic Spec Decoding] Auto-disable by the running queue size (#4592)

Co-authored-by: Cade Daniel <edacih@gmail.com>
2024-05-08 14:44:00 -07:00
parent 89579a201f
commit f942efb5a3
11 changed files with 227 additions and 39 deletions
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -83,6 +83,7 @@ class EngineArgs:
    speculative_model: Optional[str] = None
    num_speculative_tokens: Optional[int] = None
    speculative_max_model_len: Optional[int] = None
+    speculative_disable_by_batch_size: Optional[int] = None
    ngram_prompt_lookup_max: Optional[int] = None
    ngram_prompt_lookup_min: Optional[int] = None

@@ -467,6 +468,13 @@ class EngineArgs:
            'draft model. Sequences over this length will skip '
            'speculation.')

+        parser.add_argument(
+            '--speculative-disable-by-batch-size',
+            type=int,
+            default=EngineArgs.speculative_disable_by_batch_size,
+            help='Disable speculative decoding for new incoming requests '
+            'if the number of enqueue requests is larger than this value.')
+
        parser.add_argument(
            '--ngram-prompt-lookup-max',
            type=int,
@@ -547,6 +555,8 @@ class EngineArgs:
            target_dtype=self.dtype,
            speculative_model=self.speculative_model,
            num_speculative_tokens=self.num_speculative_tokens,
+            speculative_disable_by_batch_size=self.
+            speculative_disable_by_batch_size,
            speculative_max_model_len=self.speculative_max_model_len,
            enable_chunked_prefill=self.enable_chunked_prefill,
            use_v2_block_manager=self.use_v2_block_manager,