[BugFix] Handle unscheduled requests properly when async scheduling (#27756)

Signed-off-by: Nick Hill <nhill@redhat.com>
This commit is contained in:
Nick Hill
2025-10-29 21:04:25 -07:00
committed by GitHub
parent b5bae42f91
commit 2ce5c5d3d6
9 changed files with 63 additions and 43 deletions

View File

@@ -71,6 +71,7 @@ class Scheduler(SchedulerInterface):
self.finished_req_ids_dict: dict[int, set[str]] | None = (
defaultdict(set) if include_finished_set else None
)
self.prev_step_scheduled_req_ids: set[str] = set()
# Scheduling constraints.
self.max_num_running_reqs = self.scheduler_config.max_num_seqs
@@ -444,14 +445,9 @@ class Scheduler(SchedulerInterface):
# `request.num_prompt_tokens` to consider the resumed
# requests, which have output tokens.
num_new_tokens = request.num_tokens - num_computed_tokens
if (
0
< self.scheduler_config.long_prefill_token_threshold
< num_new_tokens
):
num_new_tokens = (
self.scheduler_config.long_prefill_token_threshold
)
threshold = self.scheduler_config.long_prefill_token_threshold
if 0 < threshold < num_new_tokens:
num_new_tokens = threshold
# chunked prefill has to be enabled explicitly to allow
# pooling requests to be chunked
@@ -620,6 +616,11 @@ class Scheduler(SchedulerInterface):
structured_output_request_ids, grammar_bitmask = self.get_grammar_bitmask(
num_scheduled_tokens.keys(), scheduled_spec_decode_tokens
)
# Record the request ids that were scheduled in this step.
self.prev_step_scheduled_req_ids.clear()
self.prev_step_scheduled_req_ids.update(num_scheduled_tokens.keys())
scheduler_output = SchedulerOutput(
scheduled_new_reqs=new_reqs_data,
scheduled_cached_reqs=cached_reqs_data,
@@ -691,14 +692,12 @@ class Scheduler(SchedulerInterface):
req_ids: list[str] = []
new_token_ids: list[list[int]] = []
new_block_ids: list[tuple[list[int], ...] | None] = []
resumed_req_token_ids: list[list[int] | None] = []
all_token_ids: dict[str, list[int]] = {}
num_computed_tokens: list[int] = []
num_output_tokens: list[int] = []
resumed_req_ids = set()
# Because resumed_reqs is usually empty, it is more efficient to do
# in-place appending so that we don't need to allocate a new list.
resumed_from_preemption = [False] * len(running_reqs)
resumed_from_preemption += [True] * len(resumed_reqs)
num_running_reqs = len(running_reqs)
for idx, req in enumerate(itertools.chain(running_reqs, resumed_reqs)):
req_id = req.request_id
req_ids.append(req_id)
@@ -715,12 +714,14 @@ class Scheduler(SchedulerInterface):
req.num_computed_tokens : req.num_computed_tokens + num_tokens
]
new_token_ids.append(token_ids)
resumed_token_ids = None
if resumed_from_preemption[idx]:
resumed_token_ids = req.all_token_ids[
scheduled_in_prev_step = req_id in self.prev_step_scheduled_req_ids
if idx >= num_running_reqs:
assert not scheduled_in_prev_step
resumed_req_ids.add(req_id)
if not scheduled_in_prev_step:
all_token_ids[req_id] = req.all_token_ids[
: req.num_computed_tokens + num_tokens
]
resumed_req_token_ids.append(resumed_token_ids)
new_block_ids.append(
req_to_new_blocks[req_id].get_block_ids(allow_none=True)
)
@@ -731,9 +732,9 @@ class Scheduler(SchedulerInterface):
return CachedRequestData(
req_ids=req_ids,
resumed_from_preemption=resumed_from_preemption,
resumed_req_ids=resumed_req_ids,
new_token_ids=new_token_ids,
resumed_req_token_ids=resumed_req_token_ids,
all_token_ids=all_token_ids,
new_block_ids=new_block_ids,
num_computed_tokens=num_computed_tokens,
num_output_tokens=num_output_tokens,