Fix middleware: handle SGLang startup lag gracefully

- Add /health endpoint that returns 503 until SGLang is ready
- Background task polls SGLang until it accepts connections
- Catch ConnectError/TimeoutException instead of crashing
- Return 503 JSON error when SGLang backend is unavailable
- haproxy health-checks the middleware's /health endpoint, which reflects SGLang state
This commit is contained in:
2026-04-12 19:06:38 +00:00
parent bbe40ac8c0
commit db9231f796

View File

@@ -8,7 +8,7 @@ Currently strips: logprobs, top_logprobs
(SGLang's Mistral tool-call parser rejects these; vLLM accepts them.)
Architecture:
haproxy (original port) → middleware (port+2) → SGLang (port+1)
haproxy (port N) → middleware (port N+2) → SGLang (port N+1)
haproxy still handles /metrics stub and /health instant responses.
This middleware only touches the proxied request bodies.
@@ -16,11 +16,13 @@ This middleware only touches the proxied request bodies.
import json
import os
import asyncio
import httpx
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse, Response
import uvicorn
SGLANG_HOST = os.environ.get("SGLANG_HOST", "127.0.0.1")
SGLANG_PORT = int(os.environ.get("SGLANG_PORT", "8001"))
LISTEN_PORT = int(os.environ.get("MIDDLEWARE_PORT", "8002"))
@@ -30,15 +32,35 @@ STRIP_PARAMS = {"logprobs", "top_logprobs"}
app = FastAPI()
client: httpx.AsyncClient | None = None
_sglang_ready = False
@app.on_event("startup")
async def startup():
global client
client = httpx.AsyncClient(
base_url=f"http://127.0.0.1:{SGLANG_PORT}",
timeout=httpx.Timeout(300.0),
timeout=httpx.Timeout(300.0, connect=10.0),
)
# Background task: wait for SGLang to become available
asyncio.create_task(_wait_for_sglang())
async def _wait_for_sglang():
    """Poll SGLang until it's accepting connections, then mark ready.

    Issues a GET to SGLang's /health every 2 seconds. On the first 200
    response the module-level ``_sglang_ready`` flag is set to True, a
    message is printed, and the coroutine returns. Non-200 responses and
    transport-level failures are treated as "not ready yet" and retried.
    """
    global _sglang_ready
    health_url = f"http://{SGLANG_HOST}:{SGLANG_PORT}/health"
    while True:
        try:
            resp = await client.get(
                health_url,
                timeout=httpx.Timeout(5.0, connect=2.0),
            )
        except httpx.TransportError:
            # TransportError is the common base of ConnectError and
            # TimeoutException, and also covers ReadError /
            # RemoteProtocolError, which a half-started backend can
            # produce. Letting those escape would end this task and leave
            # the readiness flag False permanently.
            pass
        else:
            if resp.status_code == 200:
                _sglang_ready = True
                print(f"Middleware: SGLang is ready at {SGLANG_HOST}:{SGLANG_PORT}")
                return
        await asyncio.sleep(2)
@app.on_event("shutdown")
@@ -46,6 +68,25 @@ async def shutdown():
await client.aclose()
@app.get("/health")
async def health():
    """Health check — haproxy polls this; it mirrors SGLang's own /health.

    Returns 503 with a plain-text body while the readiness flag is unset.
    Once set, each call forwards a GET to SGLang's /health and relays its
    status code, body, and content type. If that request fails to connect
    or times out, the flag is cleared, the background poller is started
    again, and 503 is returned.
    """
    # Required: the assignment in the except branch would otherwise make
    # `_sglang_ready` a function-local name, and the read just below would
    # raise UnboundLocalError on every request.
    global _sglang_ready
    if not _sglang_ready:
        return Response(content="SGLang not ready", status_code=503)
    try:
        resp = await client.get(
            f"http://{SGLANG_HOST}:{SGLANG_PORT}/health",
            timeout=httpx.Timeout(5.0, connect=2.0),
        )
    except (httpx.ConnectError, httpx.TimeoutException):
        _sglang_ready = False
        # Re-trigger background wait; later health checks short-circuit on
        # the cleared flag until the poller sets it again.
        asyncio.create_task(_wait_for_sglang())
        return Response(content="SGLang not ready", status_code=503)
    return Response(
        content=resp.content,
        status_code=resp.status_code,
        media_type=resp.headers.get("content-type"),
    )
@app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "PATCH", "OPTIONS"])
async def proxy(path: str, request: Request):
body = await request.body()
@@ -73,10 +114,11 @@ async def proxy(path: str, request: Request):
}
fwd_headers["content-length"] = str(len(body))
url = f"http://127.0.0.1:{SGLANG_PORT}/{path}"
url = f"http://{SGLANG_HOST}:{SGLANG_PORT}/{path}"
if request.query_params:
url += f"?{request.query_params}"
try:
if is_streaming:
req = client.build_request(request.method, url, content=body, headers=fwd_headers)
resp = await client.send(req, stream=True)
@@ -100,6 +142,12 @@ async def proxy(path: str, request: Request):
status_code=resp.status_code,
media_type=resp.headers.get("content-type"),
)
except (httpx.ConnectError, httpx.TimeoutException) as e:
return Response(
content=json.dumps({"error": {"message": f"SGLang backend unavailable: {e}", "type": "backend_error"}}),
status_code=503,
media_type="application/json",
)
if __name__ == "__main__":