[Core] Eliminate parallel worker per-step task scheduling overhead (#4894)
@@ -234,6 +234,14 @@ class _AsyncLLMEngine(LLMEngine):
         # Log stats.
         self.do_log_stats(scheduler_outputs, output)
 
+        if not request_outputs:
+            # Stop the execute model loop in parallel workers until there are
+            # more requests to process. This avoids waiting indefinitely in
+            # torch.distributed ops which may otherwise timeout, and unblocks
+            # the RPC thread in the workers so that they can process any other
+            # queued control plane messages, such as add/remove lora adapters.
+            await self.model_executor.stop_remote_worker_execution_loop_async()
+
         return request_outputs
 
     async def encode_request_async(
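The comment block added here describes the pattern the executor relies on: workers stay in a long-running execute-model loop (eliminating per-step task scheduling), so the driver must explicitly release them when the engine goes idle. Below is a minimal sketch of that pattern, not vLLM's actual executor code; the broadcast-based control flow, the None stop sentinel, and the run_model callable are illustrative assumptions.

import torch.distributed as dist

def worker_execution_loop(run_model) -> None:
    """Runs on each parallel worker until the driver signals a stop.

    run_model is a hypothetical callable that executes one model step.
    """
    while True:
        payload = [None]
        # Blocks until the driver (rank 0) broadcasts the next step's input.
        # Without an explicit stop, an idle worker would sit in this
        # collective op until the distributed timeout fires, and its RPC
        # thread could not service queued control plane messages
        # (e.g. add/remove lora adapters).
        dist.broadcast_object_list(payload, src=0)
        if payload[0] is None:  # assumed stop sentinel
            return              # worker thread is free for other RPCs
        run_model(payload[0])

def stop_remote_worker_execution_loop() -> None:
    """Driver-side counterpart: broadcast the sentinel so workers exit."""
    dist.broadcast_object_list([None], src=0)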
@@ -687,7 +695,7 @@ class AsyncLLMEngine:
             multi_modal_data: Multi modal data per request.
 
         Yields:
             The output `EmbeddingRequestOutput` objects from the LLMEngine
             for the request.
 
         Details:
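For reference, a hedged sketch of consuming the generator documented in this hunk; the engine construction, model name, and exact encode signature below are assumptions, not taken from this diff.

import asyncio

from vllm import PoolingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

async def embed(prompt: str) -> None:
    # Assumed embedding-capable model; any supported embedding model works.
    engine = AsyncLLMEngine.from_engine_args(
        AsyncEngineArgs(model="intfloat/e5-mistral-7b-instruct"))
    # encode yields EmbeddingRequestOutput objects for the request as they
    # become available, per the docstring above.
    async for output in engine.encode(prompt, PoolingParams(), "request-0"):
        print(output.outputs.embedding[:8])

asyncio.run(embed("hello world"))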