[Perf] Async Scheduling + Speculative Decoding + Structured Outputs (#29821)

Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
Signed-off-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: Nick Hill <nickhill123@gmail.com>
This commit is contained in:
Benjamin Chislett
2026-01-06 13:50:37 -05:00
committed by GitHub
parent 4e67a8f616
commit f7008ce1c4
8 changed files with 185 additions and 55 deletions

View File

@@ -466,6 +466,18 @@ class EngineCore:
# in a field and do it immediately once step_with_batch_queue is
# re-called. The latter slightly favors TTFT over TPOT/throughput.
if deferred_scheduler_output:
# If we are doing speculative decoding with structured output,
# we need to get the draft token ids from the prior step before
# we can compute the grammar bitmask for the deferred request.
if self.use_spec_decode:
draft_token_ids = self.model_executor.take_draft_token_ids()
assert draft_token_ids is not None
# Update the draft token ids in the scheduler output to
# filter out the invalid spec tokens, which will be padded
# with -1 and skipped by the grammar bitmask computation.
self.scheduler.update_draft_token_ids_in_output(
draft_token_ids, deferred_scheduler_output
)
# We now have the tokens needed to compute the bitmask for the
# deferred request. Get the bitmask and call sample tokens.
grammar_output = self.scheduler.get_grammar_bitmask(

View File

@@ -158,12 +158,11 @@ class InputProcessor:
or params.presence_penalty != 0.0
or params.repetition_penalty != 1.0
or params.bad_words_token_ids
or params.structured_outputs
)
):
raise ValueError(
"async scheduling with spec decoding doesn't yet support "
"penalties, bad words or structured outputs in sampling parameters."
"penalties or bad words in sampling parameters."
)
def _validate_params(