Fix middleware: handle SGLang startup lag gracefully
- Add /health endpoint that returns 503 until SGLang is ready
- Background task polls SGLang until it accepts connections
- Catch ConnectError/TimeoutException instead of crashing
- Return 503 JSON error when SGLang backend is unavailable
- haproxy health-checks middleware /health, which reflects SGLang state
This commit is contained in:
@@ -8,7 +8,7 @@ Currently strips: logprobs, top_logprobs
|
||||
(SGLang's Mistral tool-call parser rejects these; vLLM accepts them.)
|
||||
|
||||
Architecture:
|
||||
haproxy (original port) → middleware (port+2) → SGLang (port+1)
|
||||
haproxy (port N) → middleware (port N+2) → SGLang (port N+1)
|
||||
|
||||
haproxy still handles /metrics stub and /health instant responses.
|
||||
This middleware only touches the proxied request bodies.
|
||||
@@ -16,11 +16,13 @@ This middleware only touches the proxied request bodies.
|
||||
|
||||
import json
|
||||
import os
|
||||
import asyncio
|
||||
import httpx
|
||||
from fastapi import FastAPI, Request
|
||||
from fastapi.responses import StreamingResponse, Response
|
||||
import uvicorn
|
||||
|
||||
# Host of the SGLang backend; taken from the SGLANG_HOST environment variable,
# defaulting to loopback.
SGLANG_HOST = os.environ.get("SGLANG_HOST", "127.0.0.1")
|
||||
# Port of the SGLang backend (SGLANG_PORT env var, default 8001). int() raises
# ValueError at import time if the variable is set to a non-numeric value.
SGLANG_PORT = int(os.environ.get("SGLANG_PORT", "8001"))
|
||||
# Port read from the MIDDLEWARE_PORT env var (default 8002).
# NOTE(review): presumably the port this middleware itself listens on — the
# code that consumes it is outside this view; confirm.
LISTEN_PORT = int(os.environ.get("MIDDLEWARE_PORT", "8002"))
|
||||
|
||||
@@ -30,15 +32,35 @@ STRIP_PARAMS = {"logprobs", "top_logprobs"}
|
||||
|
||||
# The FastAPI application that the route and lifecycle decorators attach to.
app = FastAPI()
|
||||
# Shared async HTTP client used to talk to SGLang; None until the startup
# hook creates it.
client: httpx.AsyncClient | None = None
|
||||
# Set to True by the background poller once SGLang's /health has returned 200.
_sglang_ready = False
|
||||
|
||||
|
||||
@app.on_event("startup")
async def startup():
    """Create the shared httpx client and start polling SGLang for readiness.

    Runs once when the FastAPI application starts. The client is stored in
    the module-level ``client`` so request handlers can reuse it. The
    readiness poller runs in the background, so this hook returns
    immediately even while SGLang is still starting up.
    """
    global client
    client = httpx.AsyncClient(
        # Use the configured host rather than a hard-coded loopback address,
        # matching the absolute URLs built from SGLANG_HOST elsewhere.
        base_url=f"http://{SGLANG_HOST}:{SGLANG_PORT}",
        # 300 s default for read/write/pool, but only 10 s to establish a
        # connection, so an absent backend is detected quickly.
        timeout=httpx.Timeout(300.0, connect=10.0),
    )
    # Background task: wait for SGLang to become available.
    # Keep a reference on app.state: the asyncio docs recommend saving the
    # result of create_task() so the task cannot be garbage-collected
    # before it finishes.
    app.state.sglang_wait_task = asyncio.create_task(_wait_for_sglang())
|
||||
|
||||
|
||||
async def _wait_for_sglang():
    """Poll SGLang's /health until it returns 200, then mark the backend ready.

    On the first 200 response this sets the module-level ``_sglang_ready``
    flag to True, prints a message, and returns. A non-200 response or any
    transport-level failure counts as "not ready yet" and is retried after
    a 2-second pause. The loop has no upper bound on attempts.
    """
    global _sglang_ready
    # Loop-invariant values, built once.
    health_url = f"http://{SGLANG_HOST}:{SGLANG_PORT}/health"
    probe_timeout = httpx.Timeout(5.0, connect=2.0)
    while True:
        try:
            resp = await client.get(health_url, timeout=probe_timeout)
        except httpx.TransportError:
            # TransportError is the common base of ConnectError,
            # TimeoutException, ReadError, RemoteProtocolError, etc. A
            # half-started backend can produce any of these; none of them
            # should kill the poller, or readiness would never be reported.
            pass
        else:
            if resp.status_code == 200:
                _sglang_ready = True
                print(f"Middleware: SGLang is ready at {SGLANG_HOST}:{SGLANG_PORT}")
                return
        await asyncio.sleep(2)
|
||||
|
||||
|
||||
@app.on_event("shutdown")
|
||||
@@ -46,6 +68,25 @@ async def shutdown():
|
||||
await client.aclose()
|
||||
|
||||
|
||||
@app.get("/health")
async def health():
    """Health check — haproxy polls this; reflects the state of SGLang.

    Returns:
        * 503 with body ``SGLang not ready`` while the readiness flag is
          unset, or when the live probe of SGLang's /health fails at the
          transport level.
        * Otherwise SGLang's own /health response (status code, body and
          content type) passed through unchanged.

    When a live probe fails, the readiness flag is cleared and the
    background poller is restarted so readiness is restored automatically
    once SGLang comes back.
    """
    # Required: this function assigns the flag below. Without the global
    # declaration the name is local to the function and the very first
    # read raises UnboundLocalError.
    global _sglang_ready
    if not _sglang_ready:
        return Response(content="SGLang not ready", status_code=503)
    try:
        resp = await client.get(
            f"http://{SGLANG_HOST}:{SGLANG_PORT}/health",
            timeout=httpx.Timeout(5.0, connect=2.0),
        )
    except httpx.TransportError:
        # Covers ConnectError and TimeoutException as well as other
        # network/protocol failures that equally mean "backend is down".
        _sglang_ready = False
        # Re-trigger the background wait, but only if no poller is already
        # running: concurrent failing probes must not each spawn one.
        # The reference is kept on app.state so the task is not
        # garbage-collected before it finishes.
        pending = getattr(app.state, "sglang_wait_task", None)
        if pending is None or pending.done():
            app.state.sglang_wait_task = asyncio.create_task(_wait_for_sglang())
        return Response(content="SGLang not ready", status_code=503)
    return Response(
        content=resp.content,
        status_code=resp.status_code,
        media_type=resp.headers.get("content-type"),
    )
|
||||
|
||||
|
||||
@app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "PATCH", "OPTIONS"])
|
||||
async def proxy(path: str, request: Request):
|
||||
body = await request.body()
|
||||
@@ -73,10 +114,11 @@ async def proxy(path: str, request: Request):
|
||||
}
|
||||
fwd_headers["content-length"] = str(len(body))
|
||||
|
||||
url = f"http://127.0.0.1:{SGLANG_PORT}/{path}"
|
||||
url = f"http://{SGLANG_HOST}:{SGLANG_PORT}/{path}"
|
||||
if request.query_params:
|
||||
url += f"?{request.query_params}"
|
||||
|
||||
try:
|
||||
if is_streaming:
|
||||
req = client.build_request(request.method, url, content=body, headers=fwd_headers)
|
||||
resp = await client.send(req, stream=True)
|
||||
@@ -100,6 +142,12 @@ async def proxy(path: str, request: Request):
|
||||
status_code=resp.status_code,
|
||||
media_type=resp.headers.get("content-type"),
|
||||
)
|
||||
except (httpx.ConnectError, httpx.TimeoutException) as e:
|
||||
return Response(
|
||||
content=json.dumps({"error": {"message": f"SGLang backend unavailable: {e}", "type": "backend_error"}}),
|
||||
status_code=503,
|
||||
media_type="application/json",
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user