[Bugfix][V1] GPUModelRunner._update_states should return True when there is a finished request in batch (#13126)

This commit is contained in:
Kero Liang
2025-02-14 14:39:20 +08:00
committed by GitHub
parent ba59b78a9c
commit b0ccfc565a
2 changed files with 238 additions and 1 deletion

View File

@@ -363,7 +363,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
# Condense the batched states if there are empty indices.
if removed_req_indices:
self.input_batch.condense(removed_req_indices)
return len(unscheduled_req_ids) > 0 or len(req_ids_to_add) > 0
return (len(unscheduled_req_ids) > 0 or len(req_ids_to_add) > 0
or len(scheduler_output.finished_req_ids) > 0)
def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens