[Perf] Async Scheduling + Speculative Decoding + Structured Outputs (#29821)
Signed-off-by: Benjamin Chislett <bchislett@nvidia.com>
Signed-off-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: Nick Hill <nickhill123@gmail.com>
This commit is contained in:
committed by
GitHub
parent
4e67a8f616
commit
f7008ce1c4
@@ -466,6 +466,18 @@ class EngineCore:
|
||||
# in a field and do it immediately once step_with_batch_queue is
|
||||
# re-called. The latter slightly favors TTFT over TPOT/throughput.
|
||||
if deferred_scheduler_output:
|
||||
+ # If we are doing speculative decoding with structured output,
|
||||
+ # we need to get the draft token ids from the prior step before
|
||||
+ # we can compute the grammar bitmask for the deferred request.
|
||||
+ if self.use_spec_decode:
|
||||
+ draft_token_ids = self.model_executor.take_draft_token_ids()
|
||||
+ assert draft_token_ids is not None
|
||||
+ # Update the draft token ids in the scheduler output to
|
||||
+ # filter out the invalid spec tokens, which will be padded
|
||||
+ # with -1 and skipped by the grammar bitmask computation.
|
||||
+ self.scheduler.update_draft_token_ids_in_output(
|
||||
+ draft_token_ids, deferred_scheduler_output
|
||||
+ )
|
||||
# We now have the tokens needed to compute the bitmask for the
|
||||
# deferred request. Get the bitmask and call sample tokens.
|
||||
grammar_output = self.scheduler.get_grammar_bitmask(
|
||||
|
||||
@@ -158,12 +158,11 @@ class InputProcessor:
|
||||
or params.presence_penalty != 0.0
|
||||
or params.repetition_penalty != 1.0
|
||||
or params.bad_words_token_ids
|
||||
- or params.structured_outputs
|
||||
)
|
||||
):
|
||||
raise ValueError(
|
||||
"async scheduling with spec decoding doesn't yet support "
|
||||
- "penalties, bad words or structured outputs in sampling parameters."
|
||||
+ "penalties or bad words in sampling parameters."
|
||||
)
|
||||
|
||||
def _validate_params(
|
||||
|
||||
Reference in New Issue
Block a user