[BugFix] Handle unscheduled requests properly when async scheduling is enabled (#27756)
Signed-off-by: Nick Hill <nhill@redhat.com>
This commit is contained in:
@@ -71,6 +71,7 @@ class Scheduler(SchedulerInterface):
|
||||
self.finished_req_ids_dict: dict[int, set[str]] | None = (
|
||||
defaultdict(set) if include_finished_set else None
|
||||
)
|
||||
self.prev_step_scheduled_req_ids: set[str] = set()
|
||||
|
||||
# Scheduling constraints.
|
||||
self.max_num_running_reqs = self.scheduler_config.max_num_seqs
|
||||
@@ -444,14 +445,9 @@ class Scheduler(SchedulerInterface):
|
||||
# `request.num_prompt_tokens` to consider the resumed
|
||||
# requests, which have output tokens.
|
||||
num_new_tokens = request.num_tokens - num_computed_tokens
|
||||
if (
|
||||
0
|
||||
< self.scheduler_config.long_prefill_token_threshold
|
||||
< num_new_tokens
|
||||
):
|
||||
num_new_tokens = (
|
||||
self.scheduler_config.long_prefill_token_threshold
|
||||
)
|
||||
threshold = self.scheduler_config.long_prefill_token_threshold
|
||||
if 0 < threshold < num_new_tokens:
|
||||
num_new_tokens = threshold
|
||||
|
||||
# chunked prefill has to be enabled explicitly to allow
|
||||
# pooling requests to be chunked
|
||||
@@ -620,6 +616,11 @@ class Scheduler(SchedulerInterface):
|
||||
structured_output_request_ids, grammar_bitmask = self.get_grammar_bitmask(
|
||||
num_scheduled_tokens.keys(), scheduled_spec_decode_tokens
|
||||
)
|
||||
|
||||
# Record the request ids that were scheduled in this step.
|
||||
self.prev_step_scheduled_req_ids.clear()
|
||||
self.prev_step_scheduled_req_ids.update(num_scheduled_tokens.keys())
|
||||
|
||||
scheduler_output = SchedulerOutput(
|
||||
scheduled_new_reqs=new_reqs_data,
|
||||
scheduled_cached_reqs=cached_reqs_data,
|
||||
@@ -691,14 +692,12 @@ class Scheduler(SchedulerInterface):
|
||||
req_ids: list[str] = []
|
||||
new_token_ids: list[list[int]] = []
|
||||
new_block_ids: list[tuple[list[int], ...] | None] = []
|
||||
resumed_req_token_ids: list[list[int] | None] = []
|
||||
all_token_ids: dict[str, list[int]] = {}
|
||||
num_computed_tokens: list[int] = []
|
||||
num_output_tokens: list[int] = []
|
||||
resumed_req_ids = set()
|
||||
|
||||
# Because resumed_reqs is usually empty, it is more efficient to do
|
||||
# in-place appending so that we don't need to allocate a new list.
|
||||
resumed_from_preemption = [False] * len(running_reqs)
|
||||
resumed_from_preemption += [True] * len(resumed_reqs)
|
||||
num_running_reqs = len(running_reqs)
|
||||
for idx, req in enumerate(itertools.chain(running_reqs, resumed_reqs)):
|
||||
req_id = req.request_id
|
||||
req_ids.append(req_id)
|
||||
@@ -715,12 +714,14 @@ class Scheduler(SchedulerInterface):
|
||||
req.num_computed_tokens : req.num_computed_tokens + num_tokens
|
||||
]
|
||||
new_token_ids.append(token_ids)
|
||||
resumed_token_ids = None
|
||||
if resumed_from_preemption[idx]:
|
||||
resumed_token_ids = req.all_token_ids[
|
||||
scheduled_in_prev_step = req_id in self.prev_step_scheduled_req_ids
|
||||
if idx >= num_running_reqs:
|
||||
assert not scheduled_in_prev_step
|
||||
resumed_req_ids.add(req_id)
|
||||
if not scheduled_in_prev_step:
|
||||
all_token_ids[req_id] = req.all_token_ids[
|
||||
: req.num_computed_tokens + num_tokens
|
||||
]
|
||||
resumed_req_token_ids.append(resumed_token_ids)
|
||||
new_block_ids.append(
|
||||
req_to_new_blocks[req_id].get_block_ids(allow_none=True)
|
||||
)
|
||||
@@ -731,9 +732,9 @@ class Scheduler(SchedulerInterface):
|
||||
|
||||
return CachedRequestData(
|
||||
req_ids=req_ids,
|
||||
resumed_from_preemption=resumed_from_preemption,
|
||||
resumed_req_ids=resumed_req_ids,
|
||||
new_token_ids=new_token_ids,
|
||||
resumed_req_token_ids=resumed_req_token_ids,
|
||||
all_token_ids=all_token_ids,
|
||||
new_block_ids=new_block_ids,
|
||||
num_computed_tokens=num_computed_tokens,
|
||||
num_output_tokens=num_output_tokens,
|
||||
|
||||
Reference in New Issue
Block a user