Fix middleware: handle SGLang startup lag gracefully

- Add /health endpoint that returns 503 until SGLang is ready - Background task polls SGLang until it accepts connections - Catch ConnectError/TimeoutException instead of crashing - Return 503 JSON error when SGLang backend is unavailable - haproxy health-checks middleware /health, which reflects SGLang state
2026-04-12 19:06:38 +00:00
parent bbe40ac8c0
commit db9231f796
1 changed files with 71 additions and 23 deletions
--- a/vllm_middleware.py
+++ b/vllm_middleware.py
@@ -8,7 +8,7 @@ Currently strips: logprobs, top_logprobs
 (SGLang's Mistral tool-call parser rejects these; vLLM accepts them.)

 Architecture:
-  haproxy (original port) → middleware (port+2) → SGLang (port+1)
+  haproxy (port N) → middleware (port N+2) → SGLang (port N+1)

 haproxy still handles /metrics stub and /health instant responses.
 This middleware only touches the proxied request bodies.
@@ -16,11 +16,13 @@ This middleware only touches the proxied request bodies.

 import json
 import os
+import asyncio
 import httpx
 from fastapi import FastAPI, Request
 from fastapi.responses import StreamingResponse, Response
 import uvicorn

+SGLANG_HOST = os.environ.get("SGLANG_HOST", "127.0.0.1")
 SGLANG_PORT = int(os.environ.get("SGLANG_PORT", "8001"))
 LISTEN_PORT = int(os.environ.get("MIDDLEWARE_PORT", "8002"))

@@ -30,15 +32,35 @@ STRIP_PARAMS = {"logprobs", "top_logprobs"}

 app = FastAPI()
 client: httpx.AsyncClient | None = None
+_sglang_ready = False


@app.on_event("startup")
 async def startup():
    global client
    client = httpx.AsyncClient(
-        base_url=f"http://127.0.0.1:{SGLANG_PORT}",
-        timeout=httpx.Timeout(300.0),
+        timeout=httpx.Timeout(300.0, connect=10.0),
    )
+    # Background task: wait for SGLang to become available
+    asyncio.create_task(_wait_for_sglang())
+
+
+async def _wait_for_sglang():
+    """Poll SGLang until it's accepting connections, then mark ready."""
+    global _sglang_ready
+    while True:
+        try:
+            resp = await client.get(
+                f"http://{SGLANG_HOST}:{SGLANG_PORT}/health",
+                timeout=httpx.Timeout(5.0, connect=2.0),
+            )
+            if resp.status_code == 200:
+                _sglang_ready = True
+                print(f"Middleware: SGLang is ready at {SGLANG_HOST}:{SGLANG_PORT}")
+                return
+        except (httpx.ConnectError, httpx.TimeoutException):
+            pass
+        await asyncio.sleep(2)


@app.on_event("shutdown")
@@ -46,6 +68,25 @@ async def shutdown():
    await client.aclose()


+@app.get("/health")
+async def health():
+    """Health check — haproxy polls this. Returns 200 only if SGLang is up."""
+    if not _sglang_ready:
+        return Response(content="SGLang not ready", status_code=503)
+    try:
+        resp = await client.get(
+            f"http://{SGLANG_HOST}:{SGLANG_PORT}/health",
+            timeout=httpx.Timeout(5.0, connect=2.0),
+        )
+        return Response(content=resp.content, status_code=resp.status_code,
+                        media_type=resp.headers.get("content-type"))
+    except (httpx.ConnectError, httpx.TimeoutException):
+        _sglang_ready = False
+        # Re-trigger background wait
+        asyncio.create_task(_wait_for_sglang())
+        return Response(content="SGLang not ready", status_code=503)
+
+
@app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "PATCH", "OPTIONS"])
 async def proxy(path: str, request: Request):
    body = await request.body()
@@ -73,10 +114,11 @@ async def proxy(path: str, request: Request):
    }
    fwd_headers["content-length"] = str(len(body))

-    url = f"http://127.0.0.1:{SGLANG_PORT}/{path}"
+    url = f"http://{SGLANG_HOST}:{SGLANG_PORT}/{path}"
    if request.query_params:
        url += f"?{request.query_params}"

+    try:
        if is_streaming:
            req = client.build_request(request.method, url, content=body, headers=fwd_headers)
            resp = await client.send(req, stream=True)
@@ -100,6 +142,12 @@ async def proxy(path: str, request: Request):
                status_code=resp.status_code,
                media_type=resp.headers.get("content-type"),
            )
+    except (httpx.ConnectError, httpx.TimeoutException) as e:
+        return Response(
+            content=json.dumps({"error": {"message": f"SGLang backend unavailable: {e}", "type": "backend_error"}}),
+            status_code=503,
+            media_type="application/json",
+        )


 if __name__ == "__main__":