[Core] Async Scheduling X Spec Decoding Compatibility (#24799)

Signed-off-by: Ronald1995 <ronaldautomobile@163.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Signed-off-by: Benjamin Chislett <chislett.ben@gmail.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Benjamin Chislett <chislett.ben@gmail.com>
This commit is contained in:
Ronald
2025-11-18 04:16:20 +08:00
committed by GitHub
parent f8b19c0ffd
commit d8874c61a5
11 changed files with 314 additions and 98 deletions

View File

@@ -348,7 +348,10 @@ class Scheduler(SchedulerInterface):
 # Speculative decode related.
 if request.spec_token_ids:
     num_scheduled_spec_tokens = (
-        num_new_tokens + request.num_computed_tokens - request.num_tokens
+        num_new_tokens
+        + request.num_computed_tokens
+        - request.num_tokens
+        - request.num_output_placeholders
     )
     if num_scheduled_spec_tokens > 0:
         # Trim spec_token_ids list to num_scheduled_spec_tokens.
@@ -1024,7 +1027,12 @@ class Scheduler(SchedulerInterface):
 # tokens and rejections. If some tokens are rejected,
 # num_computed_tokens is decreased by the number of rejected
 # tokens.
-request.num_computed_tokens -= num_rejected
+if request.num_computed_tokens > 0:
+    request.num_computed_tokens -= num_rejected
+# If async scheduling, num_output_placeholders also includes
+# the scheduled spec tokens count and so is similarly adjusted.
+if request.num_output_placeholders > 0:
+    request.num_output_placeholders -= num_rejected
 spec_decoding_stats = self.make_spec_decoding_stats(
     spec_decoding_stats,
     num_draft_tokens=num_draft_tokens,