[Core] Async Scheduling X Spec Decoding Compatibility (#24799)
Signed-off-by: Ronald1995 <ronaldautomobile@163.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Signed-off-by: Benjamin Chislett <chislett.ben@gmail.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Benjamin Chislett <chislett.ben@gmail.com>
This commit is contained in:
@@ -348,7 +348,10 @@ class Scheduler(SchedulerInterface):
 # Speculative decode related.
 if request.spec_token_ids:
     num_scheduled_spec_tokens = (
-        num_new_tokens + request.num_computed_tokens - request.num_tokens
+        num_new_tokens
+        + request.num_computed_tokens
+        - request.num_tokens
+        - request.num_output_placeholders
     )
     if num_scheduled_spec_tokens > 0:
         # Trim spec_token_ids list to num_scheduled_spec_tokens.
@@ -1024,7 +1027,12 @@ class Scheduler(SchedulerInterface):
 # tokens and rejections. If some tokens are rejected,
 # num_computed_tokens is decreased by the number of rejected
 # tokens.
-request.num_computed_tokens -= num_rejected
+if request.num_computed_tokens > 0:
+    request.num_computed_tokens -= num_rejected
+# If async scheduling, num_output_placeholders also includes
+# the scheduled spec tokens count and so is similarly adjusted.
+if request.num_output_placeholders > 0:
+    request.num_output_placeholders -= num_rejected
 spec_decoding_stats = self.make_spec_decoding_stats(
     spec_decoding_stats,
     num_draft_tokens=num_draft_tokens,
Reference in New Issue
Block a user