[Bug][Frontend] Improve ZMQ client robustness (#7443)

Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
2024-08-21 20:18:11 -06:00
parent df1a21131d
commit cde9183b40
6 changed files with 176 additions and 28 deletions
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -56,6 +56,7 @@ if TYPE_CHECKING:
    VERBOSE: bool = False
    VLLM_ALLOW_LONG_MAX_MODEL_LEN: bool = False
    VLLM_TEST_FORCE_FP8_MARLIN: bool = False
+    VLLM_RPC_GET_DATA_TIMEOUT_MS: int = 5000
    VLLM_ALLOW_ENGINE_USE_RAY: bool = False
    VLLM_PLUGINS: Optional[List[str]] = None
    VLLM_TORCH_PROFILER_DIR: Optional[str] = None
@@ -374,6 +375,11 @@ environment_variables: Dict[str, Callable[[], Any]] = {
    (os.environ.get("VLLM_TEST_FORCE_FP8_MARLIN", "0").strip().lower() in
     ("1", "true")),

+    # Timeout in ms for the ZMQ client to wait for a response from the backend
+    # server for simple data operations
+    "VLLM_RPC_GET_DATA_TIMEOUT_MS":
+    lambda: int(os.getenv("VLLM_RPC_GET_DATA_TIMEOUT_MS", "5000")),
+
    # If set, allow running the engine as a separate ray actor,
    # which is a deprecated feature soon to be removed.
    # See https://github.com/vllm-project/vllm/issues/7045