[BugFix] Priority scheduling and spec tokens preemption (#28558)

Signed-off-by: Andy Lo <andy@mistral.ai>
This commit is contained in:
Andy Lo
2025-11-12 20:29:21 +00:00
committed by GitHub
parent 94a9ebcf31
commit 58ce8d12b7
2 changed files with 266 additions and 0 deletions

View File

@@ -300,6 +300,20 @@ class Scheduler(SchedulerInterface):
]
req_to_new_blocks.pop(preempted_req.request_id)
num_scheduled_tokens.pop(preempted_req.request_id)
scheduled_spec_decode_tokens.pop(
preempted_req.request_id, None
)
preempted_encoder_inputs = scheduled_encoder_inputs.pop(
preempted_req.request_id, None
)
if preempted_encoder_inputs:
# Restore encoder compute budget if the preempted
# request had encoder inputs scheduled in this step.
num_tokens_to_restore = sum(
preempted_req.get_num_encoder_tokens(i)
for i in preempted_encoder_inputs
)
encoder_compute_budget += num_tokens_to_restore
req_index -= 1
else:
preempted_req = self.running.pop()