[Bugfix][V1] GPUModelRunner._update_states should return True when there is a finished request in batch (#13126)

This commit is contained in:
Kero Liang
2025-02-14 14:39:20 +08:00
committed by GitHub
parent ba59b78a9c
commit b0ccfc565a
2 changed files with 238 additions and 1 deletion

View File

@@ -363,7 +363,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
# Condense the batched states if there are empty indices.
if removed_req_indices:
self.input_batch.condense(removed_req_indices)
return len(unscheduled_req_ids) > 0 or len(req_ids_to_add) > 0
return (len(unscheduled_req_ids) > 0 or len(req_ids_to_add) > 0
or len(scheduler_output.finished_req_ids) > 0)
def _prepare_inputs(self, scheduler_output: "SchedulerOutput"):
total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens