[Bugfix][Frontend] Fix Issues Under High Load With zeromq Frontend (#7394)

Co-authored-by: Nick Hill <nickhill@us.ibm.com>
2024-08-21 13:34:14 -04:00
parent d3c002eadc
commit f7e3b0c5aa
9 changed files with 322 additions and 141 deletions
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -135,6 +135,12 @@ async def build_async_engine_client(
        logger.info("Multiprocessing frontend to use %s for RPC Path.",
                    rpc_path)

+        # Build RPCClient, which conforms to AsyncEngineClient Protocol.
+        # NOTE: Actually, this is not true yet. We still need to support
+        # embedding models via RPC (see TODO above)
+        rpc_client = AsyncEngineRPCClient(rpc_path)
+        async_engine_client = rpc_client  # type: ignore
+
        # Start RPCServer in separate process (holds the AsyncLLMEngine).
        context = multiprocessing.get_context("spawn")
        # the current process might have CUDA context,
@@ -145,11 +151,6 @@ async def build_async_engine_client(
        rpc_server_process.start()
        logger.info("Started engine process with PID %d",
                    rpc_server_process.pid)
-        # Build RPCClient, which conforms to AsyncEngineClient Protocol.
-        # NOTE: Actually, this is not true yet. We still need to support
-        # embedding models via RPC (see TODO above)
-        rpc_client = AsyncEngineRPCClient(rpc_path)
-        async_engine_client = rpc_client  # type: ignore

        try:
            while True: