[Spec Decode] Efficient padded speculation (#24539)

Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
2025-09-18 01:07:24 -04:00
parent 5c65a72bb1
commit b7433ca1a4
5 changed files with 507 additions and 104 deletions
--- a/vllm/config/speculative.py
+++ b/vllm/config/speculative.py
@@ -83,6 +83,11 @@ class SpeculativeConfig:
    disable_by_batch_size: Optional[int] = None
    """Disable speculative decoding for new incoming requests when the number
    of enqueued requests is larger than this value, if provided."""
+    disable_padded_drafter_batch: bool = False
+    """Disable input padding for speculative decoding. If set to True,
+    speculative input batches can contain sequences of different lengths,
+    which may only be supported by certain attention backends. This currently
+    only affects the EAGLE method of speculation."""

    # Ngram proposer configuration
    prompt_lookup_max: Optional[int] = None