From bbe40ac8c00fedbe01b678f7103515ce5a791b23 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Sun, 12 Apr 2026 18:58:37 +0000 Subject: [PATCH] Add middleware to strip vLLM-only params (logprobs/top_logprobs) before forwarding to SGLang MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SGLang's Mistral tool-call parser rejects logprobs/top_logprobs with 422, while vLLM accepts them. Clients like OpenClaw send these by default. New architecture: haproxy (port N) → middleware (port N+2) → SGLang (port N+1) The middleware is a thin FastAPI app that strips incompatible params from chat completion request bodies and passes everything else through unchanged. --- Dockerfile | 1 + README.md | 20 +++++++-- vllm-shim.sh | 17 +++++-- vllm_middleware.py | 106 ++++++++++++++++++++++++++++++++++++++++++++ vllm_shim_module.py | 27 +++++++++-- 5 files changed, 160 insertions(+), 11 deletions(-) create mode 100644 vllm_middleware.py diff --git a/Dockerfile b/Dockerfile index bc0aeed..a2517a6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,6 +18,7 @@ RUN mkdir -p /opt/vllm-shim/vllm/entrypoints/openai \ COPY vllm_shim_module.py /opt/vllm-shim/vllm/__main__.py COPY vllm_shim_module.py /opt/vllm-shim/vllm/entrypoints/openai/api_server.py COPY vllm_shim_module.py /opt/vllm-shim/vllm/entrypoints/cli/main.py +COPY vllm_middleware.py /opt/vllm-shim/vllm_middleware.py RUN touch /opt/vllm-shim/vllm/__init__.py \ /opt/vllm-shim/vllm/entrypoints/__init__.py \ /opt/vllm-shim/vllm/entrypoints/openai/__init__.py \ diff --git a/README.md b/README.md index eb99250..f358acf 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,14 @@ Rather than launching SGLang directly on the vLLM port, the shim runs **haproxy* 2. **`/health` probe timing** — SGLang's `/health` endpoint takes ~1.001s to respond, which races the 1s k8s probe timeout and causes repeated `Startup probe failed: context deadline exceeded`. 
haproxy health-checks SGLang in the background (every 5s, with a 3s timeout) and responds to `/health` probes **instantly** — 200 if the backend is up, 503 if it's not. No more timeout roulette. +### middleware layer + +A Python middleware (FastAPI) sits between haproxy and SGLang on **port+2**. It strips vLLM-only request parameters that SGLang rejects with 422 errors: + +- **`logprobs`** / **`top_logprobs`** — vLLM accepts these on chat completion requests; SGLang's Mistral tool-call parser rejects them. OpenClaw and other vLLM clients send them by default. + +The middleware only touches `POST /v1/chat/completions` request bodies and passes everything else through unchanged. To strip additional params, add them to the `STRIP_PARAMS` set in `vllm_middleware.py`. + ``` ┌─────────────────────────────────────────────┐ │ k8s probes / vLLM stack │ @@ -36,7 +44,12 @@ Rather than launching SGLang directly on the vLLM port, the shim runs **haproxy* │ /metrics ──► 200 empty (stub) │ │ /health ──► 200/503 instant (backend │ │ health-checked in bg) │ -│ /* ──► proxy to SGLang │ +│ /* ──► proxy to middleware │ +│ │ │ +│ ▼ │ +│ middleware (port 8002) │ +│ strips logprobs/top_logprobs │ +│ forwards to SGLang │ │ │ │ │ ▼ │ │ SGLang (port 8001) │ @@ -86,5 +99,6 @@ To adapt for a different model, change `--model-path`, `--tp`, and `--tool-call- | File | Purpose | |---|---| | `Dockerfile` | Builds the image: ROCm SGLang base + haproxy + shims + MI300X env | -| `vllm-shim.sh` | Shell shim — replaces the `vllm` binary, launches SGLang + haproxy | -| `vllm_shim_module.py` | Python shim — shadows `vllm.*` module imports, launches SGLang + haproxy | +| `vllm-shim.sh` | Shell shim — replaces the `vllm` binary, launches SGLang + middleware + haproxy | +| `vllm_shim_module.py` | Python shim — shadows `vllm.*` module imports, launches SGLang + middleware + haproxy | +| `vllm_middleware.py` | FastAPI middleware — strips vLLM-only params (logprobs) before forwarding to SGLang | diff --git 
a/vllm-shim.sh b/vllm-shim.sh index 26fafa2..5cf24d4 100644 --- a/vllm-shim.sh +++ b/vllm-shim.sh @@ -63,9 +63,12 @@ while [[ $# -gt 0 ]]; do done # SGLang runs one port higher; haproxy binds the original port +# Middleware runs two ports higher (strips vLLM-only params) SGLANG_PORT=$((PORT + 1)) +MIDDLEWARE_PORT=$((PORT + 2)) echo "Launching SGLang on ${HOST}:${SGLANG_PORT} (internal)" +echo "Launching middleware on ${HOST}:${MIDDLEWARE_PORT} (strips logprobs)" echo "Launching haproxy on ${HOST}:${PORT} (front door, /metrics + /health stub)" echo "" @@ -109,7 +112,7 @@ frontend proxy backend sglang option httpchk GET /health http-check expect status 200 - server s1 127.0.0.1:${SGLANG_PORT} check inter 5s fall 3 rise 2 + server s1 127.0.0.1:${MIDDLEWARE_PORT} check inter 5s fall 3 rise 2 EOF echo "haproxy config written to ${HAPROXY_CFG}" >> "$LOG_PATH" @@ -124,6 +127,12 @@ python -m sglang.launch_server \ SGLANG_PID=$! +# Start the middleware (strips vLLM-only params like logprobs) +SGLANG_PORT=$SGLANG_PORT MIDDLEWARE_PORT=$MIDDLEWARE_PORT \ + python /opt/vllm-shim/vllm_middleware.py & + +MIDDLEWARE_PID=$! + # Give SGLang a moment to start before haproxy starts routing sleep 2 @@ -132,11 +141,11 @@ haproxy -f "$HAPROXY_CFG" & HAPROXY_PID=$! -echo "SGLang PID: ${SGLANG_PID}, haproxy PID: ${HAPROXY_PID}" >> "$LOG_PATH" +echo "SGLang PID: ${SGLANG_PID}, middleware PID: ${MIDDLEWARE_PID}, haproxy PID: ${HAPROXY_PID}" >> "$LOG_PATH" # Wait for whichever dies first — if either goes, we go -wait -n "$SGLANG_PID" "$HAPROXY_PID" +wait -n "$SGLANG_PID" "$MIDDLEWARE_PID" "$HAPROXY_PID" EXIT_CODE=$? 
echo "A process exited (code ${EXIT_CODE}), shutting down" >> "$LOG_PATH" -kill "$SGLANG_PID" "$HAPROXY_PID" 2>/dev/null || true +kill "$SGLANG_PID" "$MIDDLEWARE_PID" "$HAPROXY_PID" 2>/dev/null || true exit $EXIT_CODE diff --git a/vllm_middleware.py b/vllm_middleware.py new file mode 100644 index 0000000..088dbe9 --- /dev/null +++ b/vllm_middleware.py @@ -0,0 +1,106 @@ +""" +vLLM → SGLang request middleware. + +Sits between haproxy and SGLang to strip vLLM-only parameters +that cause SGLang to return 422/400 errors. + +Currently strips: logprobs, top_logprobs +(SGLang's Mistral tool-call parser rejects these; vLLM accepts them.) + +Architecture: + haproxy (original port) → middleware (port+2) → SGLang (port+1) + +haproxy still handles /metrics stub and /health instant responses. +This middleware only touches the proxied request bodies. +""" + +import json +import os +import httpx +from fastapi import FastAPI, Request +from fastapi.responses import StreamingResponse, Response +import uvicorn + +SGLANG_PORT = int(os.environ.get("SGLANG_PORT", "8001")) +LISTEN_PORT = int(os.environ.get("MIDDLEWARE_PORT", "8002")) + +# Params that vLLM accepts but SGLang rejects. +# Extend this set as more incompatibilities are discovered. 
STRIP_PARAMS = {"logprobs", "top_logprobs"}

# Request headers that are never copied onto the upstream request.
# "host" and "content-length" are regenerated for the upstream call; the rest
# are connection-scoped (hop-by-hop) headers that describe the haproxy <->
# middleware connection, not the middleware <-> SGLang one.
_SKIP_REQUEST_HEADERS = frozenset({
    "host",
    "content-length",
    "transfer-encoding",
    "connection",
    "keep-alive",
    "proxy-connection",
    "te",
    "upgrade",
})

app = FastAPI()
# Shared upstream HTTP client; created in startup(), closed in shutdown().
client: httpx.AsyncClient | None = None


@app.on_event("startup")
async def startup():
    """Create the shared httpx client pointed at the local SGLang server."""
    global client
    client = httpx.AsyncClient(
        base_url=f"http://127.0.0.1:{SGLANG_PORT}",
        timeout=httpx.Timeout(300.0),
    )


@app.on_event("shutdown")
async def shutdown():
    """Close the shared httpx client, if startup() managed to create one."""
    if client is not None:
        await client.aclose()


def _strip_params(body: bytes) -> tuple[bytes, bool]:
    """Remove STRIP_PARAMS keys from a JSON-object request body.

    Args:
        body: Raw request body bytes.

    Returns:
        A ``(body, is_streaming)`` pair. ``body`` is re-serialized only when
        at least one key was removed; otherwise the original bytes are
        returned untouched. ``is_streaming`` is the truthiness of the
        top-level ``"stream"`` field. Bodies that are not valid JSON, or
        whose top-level value is not an object, are returned unchanged with
        ``is_streaming`` False.
    """
    try:
        data = json.loads(body)
    except (json.JSONDecodeError, UnicodeDecodeError):
        # Not JSON: pass it through and let SGLang produce the error.
        return body, False
    if not isinstance(data, dict):
        # Valid JSON but not an object (list/str/number): nothing to strip,
        # and calling .get() on it would raise.
        return body, False

    is_streaming = bool(data.get("stream", False))
    present = [key for key in STRIP_PARAMS if key in data]
    if not present:
        return body, is_streaming
    for key in present:
        del data[key]
    return json.dumps(data).encode(), is_streaming


def _bad_gateway(exc: Exception) -> Response:
    """Build the 502 response returned when SGLang cannot be reached."""
    payload = {
        "error": {
            "message": f"upstream request failed: {type(exc).__name__}: {exc}",
            "type": "bad_gateway",
        }
    }
    return Response(
        content=json.dumps(payload),
        status_code=502,
        media_type="application/json",
    )


@app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "PATCH", "OPTIONS"])
async def proxy(path: str, request: Request):
    """Forward any request to SGLang, stripping STRIP_PARAMS from chat completions.

    Only POST requests whose path contains ``chat/completions`` have their
    body inspected; every other request is forwarded with its body unchanged.

    NOTE(review): because only chat-completion bodies are parsed, the
    ``stream`` flag of any other endpoint is never seen and those responses
    are always buffered in full before being returned.

    Args:
        path: Request path captured by the catch-all route (no leading slash).
        request: Incoming request.

    Returns:
        A StreamingResponse relaying upstream bytes when the chat-completion
        body asked for ``stream``; otherwise a buffered Response. If the
        upstream request fails at the transport level (connection refused,
        timeout, ...), a 502 JSON response.
    """
    body = await request.body()
    is_streaming = False

    # Strip incompatible params from chat completion POST requests
    if request.method == "POST" and "chat/completions" in path and body:
        body, is_streaming = _strip_params(body)

    # Forward headers, dropping the ones we regenerate and hop-by-hop ones
    fwd_headers = {
        name: value
        for name, value in request.headers.items()
        if name.lower() not in _SKIP_REQUEST_HEADERS
    }
    # The body may have been re-serialized, so the length is recomputed.
    fwd_headers["content-length"] = str(len(body))

    url = f"http://127.0.0.1:{SGLANG_PORT}/{path}"
    if request.query_params:
        url += f"?{request.query_params}"

    if is_streaming:
        req = client.build_request(request.method, url, content=body, headers=fwd_headers)
        try:
            resp = await client.send(req, stream=True)
        except httpx.RequestError as exc:
            return _bad_gateway(exc)

        async def stream_body():
            # Relay chunks as they arrive; always release the upstream
            # connection, including when the consumer stops early.
            try:
                async for chunk in resp.aiter_bytes():
                    yield chunk
            finally:
                await resp.aclose()

        return StreamingResponse(
            stream_body(),
            status_code=resp.status_code,
            headers={"content-type": resp.headers.get("content-type", "text/event-stream")},
        )

    try:
        resp = await client.request(request.method, url, content=body, headers=fwd_headers)
    except httpx.RequestError as exc:
        return _bad_gateway(exc)
    return Response(
        content=resp.content,
        status_code=resp.status_code,
        media_type=resp.headers.get("content-type"),
    )


if __name__ == "__main__":
    # NOTE(review): binds all interfaces; confirm whether the middleware
    # port is meant to be reachable from outside the host.
    uvicorn.run(app, host="0.0.0.0", port=LISTEN_PORT, log_level="warning")
{sglang_proc.pid}, middleware PID: {middleware_proc.pid}, haproxy PID: {haproxy_proc.pid}\n") # Wait for whichever dies first while True: sglang_ret = sglang_proc.poll() + middleware_ret = middleware_proc.poll() haproxy_ret = haproxy_proc.poll() if sglang_ret is not None: print(f"SGLang exited (code {sglang_ret}), shutting down") + middleware_proc.terminate() haproxy_proc.terminate() os._exit(sglang_ret) + if middleware_ret is not None: + print(f"Middleware exited (code {middleware_ret}), shutting down") + sglang_proc.terminate() + haproxy_proc.terminate() + os._exit(middleware_ret) if haproxy_ret is not None: print(f"haproxy exited (code {haproxy_ret}), shutting down") sglang_proc.terminate() + middleware_proc.terminate() os._exit(haproxy_ret) time.sleep(1)