[Perf] Async Scheduling + Speculative Decoding + Structured Outputs (#29821)

Signed-off-by: Benjamin Chislett <bchislett@nvidia.com> Signed-off-by: Nick Hill <nickhill123@gmail.com> Co-authored-by: Nick Hill <nickhill123@gmail.com>
2026-01-06 13:50:37 -05:00
parent 4e67a8f616
commit f7008ce1c4
8 changed files with 185 additions and 55 deletions
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -466,6 +466,18 @@ class EngineCore:
        # in a field and do it immediately once step_with_batch_queue is
        # re-called. The latter slightly favors TTFT over TPOT/throughput.
        if deferred_scheduler_output:
+            # If we are doing speculative decoding with structured output,
+            # we need to get the draft token ids from the prior step before
+            # we can compute the grammar bitmask for the deferred request.
+            if self.use_spec_decode:
+                draft_token_ids = self.model_executor.take_draft_token_ids()
+                assert draft_token_ids is not None
+                # Update the draft token ids in the scheduler output to
+                # filter out the invalid spec tokens, which will be padded
+                # with -1 and skipped by the grammar bitmask computation.
+                self.scheduler.update_draft_token_ids_in_output(
+                    draft_token_ids, deferred_scheduler_output
+                )
            # We now have the tokens needed to compute the bitmask for the
            # deferred request. Get the bitmask and call sample tokens.
            grammar_output = self.scheduler.get_grammar_bitmask(
--- a/vllm/v1/engine/input_processor.py
+++ b/vllm/v1/engine/input_processor.py
@@ -158,12 +158,11 @@ class InputProcessor:
                or params.presence_penalty != 0.0
                or params.repetition_penalty != 1.0
                or params.bad_words_token_ids
-                or params.structured_outputs
            )
        ):
            raise ValueError(
                "async scheduling with spec decoding doesn't yet support "
-                "penalties, bad words or structured outputs in sampling parameters."
+                "penalties or bad words in sampling parameters."
            )

    def _validate_params(