[V1] Eagerly remove finished requests from the batch (#14388)

Signed-off-by: Nick Hill <nhill@redhat.com>
2025-03-07 10:56:00 -08:00
parent c6359e8ca6
commit 8ed5421aaa
9 changed files with 58 additions and 16 deletions
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -253,13 +253,14 @@ class AsyncLLM(EngineClient):
            while True:
                # 1) Pull EngineCoreOutputs from the EngineCore.
                outputs = await self.engine_core.get_output_async()
+                num_outputs = len(outputs.outputs)

-                iteration_stats = IterationStats() if self.log_stats else None
+                iteration_stats = IterationStats() if (
+                    self.log_stats and num_outputs) else None

                # Split outputs into chunks of at most
                # VLLM_V1_OUTPUT_PROC_CHUNK_SIZE, so that we don't block the
                # event loop for too long.
-                num_outputs = len(outputs.outputs)
                if num_outputs <= VLLM_V1_OUTPUT_PROC_CHUNK_SIZE:
                    slices = (outputs.outputs, )
                else:
@@ -313,7 +314,6 @@ class AsyncLLM(EngineClient):
            return

        assert scheduler_stats is not None
-        assert iteration_stats is not None
        for stat_logger in self.stat_loggers:
            stat_logger.record(scheduler_stats=scheduler_stats,
                               iteration_stats=iteration_stats)