[Core] Eliminate parallel worker per-step task scheduling overhead (#4894)
@@ -234,6 +234,14 @@ class _AsyncLLMEngine(LLMEngine):
         # Log stats.
         self.do_log_stats(scheduler_outputs, output)
 
+        if not request_outputs:
+            # Stop the execute model loop in parallel workers until there are
+            # more requests to process. This avoids waiting indefinitely in
+            # torch.distributed ops which may otherwise timeout, and unblocks
+            # the RPC thread in the workers so that they can process any other
+            # queued control plane messages, such as add/remove lora adapters.
+            await self.model_executor.stop_remote_worker_execution_loop_async()
+
         return request_outputs
 
     async def encode_request_async(
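The comment block added here describes the pattern the executor relies on: workers stay in a long-running execute-model loop (eliminating per-step task scheduling), so the driver must explicitly release them when the engine goes idle. Below is a minimal sketch of that pattern, not vLLM's actual executor code; the broadcast-based control flow, the None stop sentinel, and the run_model callable are illustrative assumptions.

import torch.distributed as dist

def worker_execution_loop(run_model) -> None:
    """Runs on each parallel worker until the driver signals a stop.

    run_model is a hypothetical callable that executes one model step.
    """
    while True:
        payload = [None]
        # Blocks until the driver (rank 0) broadcasts the next step's input.
        # Without an explicit stop, an idle worker would sit in this
        # collective op until the distributed timeout fires, and its RPC
        # thread could not service queued control plane messages
        # (e.g. add/remove lora adapters).
        dist.broadcast_object_list(payload, src=0)
        if payload[0] is None:  # assumed stop sentinel
            return              # worker thread is free for other RPCs
        run_model(payload[0])

def stop_remote_worker_execution_loop() -> None:
    """Driver-side counterpart: broadcast the sentinel so workers exit."""
    dist.broadcast_object_list([None], src=0)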
@@ -687,7 +695,7 @@ class AsyncLLMEngine:
             multi_modal_data: Multi modal data per request.
 
         Yields:
             The output `EmbeddingRequestOutput` objects from the LLMEngine
             for the request.
 
         Details:
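For reference, a hedged sketch of consuming the generator documented in this hunk; the engine construction, model name, and exact encode signature below are assumptions, not taken from this diff.

import asyncio

from vllm import PoolingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

async def embed(prompt: str) -> None:
    # Assumed embedding-capable model; any supported embedding model works.
    engine = AsyncLLMEngine.from_engine_args(
        AsyncEngineArgs(model="intfloat/e5-mistral-7b-instruct"))
    # encode yields EmbeddingRequestOutput objects for the request as they
    # become available, per the docstring above.
    async for output in engine.encode(prompt, PoolingParams(), "request-0"):
        print(output.outputs.embedding[:8])

asyncio.run(embed("hello world"))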