Add middleware to strip vLLM-only params (logprobs/top_logprobs) before forwarding to SGLang
SGLang's Mistral tool-call parser rejects `logprobs`/`top_logprobs` with a 422, while vLLM accepts them. Clients like OpenClaw send these by default.

New architecture: haproxy (port N) → middleware (port N+2) → SGLang (port N+1)

The middleware is a thin FastAPI app that strips incompatible params from chat completion request bodies and passes everything else through unchanged.
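A minimal sketch of the rewrite the middleware applies to a chat completion body (the `STRIP_PARAMS` name is from this commit; the sample payload values are hypothetical):

```python
import json

# Params that vLLM accepts but SGLang's Mistral tool-call parser rejects with 422.
STRIP_PARAMS = {"logprobs", "top_logprobs"}

# Hypothetical request body, as a vLLM client such as OpenClaw might send it.
raw = json.dumps({
    "model": "example-model",
    "messages": [{"role": "user", "content": "hi"}],
    "logprobs": True,
    "top_logprobs": 5,
})

data = json.loads(raw)
for key in STRIP_PARAMS:
    data.pop(key, None)  # drop vLLM-only params before forwarding

forwarded = json.dumps(data).encode()
# SGLang now receives only the params it accepts; everything else is untouched.
```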
Dockerfile

@@ -18,6 +18,7 @@ RUN mkdir -p /opt/vllm-shim/vllm/entrypoints/openai \
 COPY vllm_shim_module.py /opt/vllm-shim/vllm/__main__.py
 COPY vllm_shim_module.py /opt/vllm-shim/vllm/entrypoints/openai/api_server.py
 COPY vllm_shim_module.py /opt/vllm-shim/vllm/entrypoints/cli/main.py
+COPY vllm_middleware.py /opt/vllm-shim/vllm_middleware.py
 RUN touch /opt/vllm-shim/vllm/__init__.py \
     /opt/vllm-shim/vllm/entrypoints/__init__.py \
     /opt/vllm-shim/vllm/entrypoints/openai/__init__.py \
README.md
@@ -27,6 +27,14 @@ Rather than launching SGLang directly on the vLLM port, the shim runs **haproxy*
 2. **`/health` probe timing** — SGLang's `/health` endpoint takes ~1.001s to respond, which races the 1s k8s probe timeout and causes repeated `Startup probe failed: context deadline exceeded`. haproxy health-checks SGLang in the background (every 5s, with a 3s timeout) and responds to `/health` probes **instantly** — 200 if the backend is up, 503 if it's not. No more timeout roulette.
+
+### middleware layer
+
+A Python middleware (FastAPI) sits between haproxy and SGLang on **port+2**. It strips vLLM-only request parameters that SGLang rejects with 422 errors:
+
+- **`logprobs`** / **`top_logprobs`** — vLLM accepts these on chat completion requests; SGLang's Mistral tool-call parser rejects them. OpenClaw and other vLLM clients send them by default.
+
+The middleware only touches `POST /v1/chat/completions` request bodies and passes everything else through unchanged. To strip additional params, add them to the `STRIP_PARAMS` set in `vllm_middleware.py`.
 
 ```
 ┌─────────────────────────────────────────────┐
 │ k8s probes / vLLM stack │
@@ -36,7 +44,12 @@ Rather than launching SGLang directly on the vLLM port, the shim runs **haproxy*
 │ /metrics ──► 200 empty (stub) │
 │ /health ──► 200/503 instant (backend │
 │ health-checked in bg) │
-│ /* ──► proxy to SGLang │
+│ /* ──► proxy to middleware │
 │ │ │
 │ ▼ │
+│ middleware (port 8002) │
+│ strips logprobs/top_logprobs │
+│ forwards to SGLang │
+│ │ │
+│ ▼ │
 │ SGLang (port 8001) │
@@ -86,5 +99,6 @@ To adapt for a different model, change `--model-path`, `--tp`, and `--tool-call-
 | File | Purpose |
 |---|---|
 | `Dockerfile` | Builds the image: ROCm SGLang base + haproxy + shims + MI300X env |
-| `vllm-shim.sh` | Shell shim — replaces the `vllm` binary, launches SGLang + haproxy |
-| `vllm_shim_module.py` | Python shim — shadows `vllm.*` module imports, launches SGLang + haproxy |
+| `vllm-shim.sh` | Shell shim — replaces the `vllm` binary, launches SGLang + middleware + haproxy |
+| `vllm_shim_module.py` | Python shim — shadows `vllm.*` module imports, launches SGLang + middleware + haproxy |
+| `vllm_middleware.py` | FastAPI middleware — strips vLLM-only params (logprobs) before forwarding to SGLang |
vllm-shim.sh
@@ -63,9 +63,12 @@ while [[ $# -gt 0 ]]; do
 done
 
 # SGLang runs one port higher; haproxy binds the original port
+# Middleware runs two ports higher (strips vLLM-only params)
 SGLANG_PORT=$((PORT + 1))
+MIDDLEWARE_PORT=$((PORT + 2))
 
 echo "Launching SGLang on ${HOST}:${SGLANG_PORT} (internal)"
+echo "Launching middleware on ${HOST}:${MIDDLEWARE_PORT} (strips logprobs)"
 echo "Launching haproxy on ${HOST}:${PORT} (front door, /metrics + /health stub)"
 echo ""
@@ -109,7 +112,7 @@ frontend proxy
 backend sglang
     option httpchk GET /health
     http-check expect status 200
-    server s1 127.0.0.1:${SGLANG_PORT} check inter 5s fall 3 rise 2
+    server s1 127.0.0.1:${MIDDLEWARE_PORT} check inter 5s fall 3 rise 2
 EOF
 
 echo "haproxy config written to ${HAPROXY_CFG}" >> "$LOG_PATH"
@@ -124,6 +127,12 @@ python -m sglang.launch_server \
 SGLANG_PID=$!
 
+# Start the middleware (strips vLLM-only params like logprobs)
+SGLANG_PORT=$SGLANG_PORT MIDDLEWARE_PORT=$MIDDLEWARE_PORT \
+    python /opt/vllm-shim/vllm_middleware.py &
+
+MIDDLEWARE_PID=$!
+
 # Give SGLang a moment to start before haproxy starts routing
 sleep 2
@@ -132,11 +141,11 @@ haproxy -f "$HAPROXY_CFG" &
 
 HAPROXY_PID=$!
 
-echo "SGLang PID: ${SGLANG_PID}, haproxy PID: ${HAPROXY_PID}" >> "$LOG_PATH"
+echo "SGLang PID: ${SGLANG_PID}, middleware PID: ${MIDDLEWARE_PID}, haproxy PID: ${HAPROXY_PID}" >> "$LOG_PATH"
 
 # Wait for whichever dies first — if either goes, we go
-wait -n "$SGLANG_PID" "$HAPROXY_PID"
+wait -n "$SGLANG_PID" "$MIDDLEWARE_PID" "$HAPROXY_PID"
 EXIT_CODE=$?
 echo "A process exited (code ${EXIT_CODE}), shutting down" >> "$LOG_PATH"
-kill "$SGLANG_PID" "$HAPROXY_PID" 2>/dev/null || true
+kill "$SGLANG_PID" "$MIDDLEWARE_PID" "$HAPROXY_PID" 2>/dev/null || true
 exit $EXIT_CODE
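The shell shim's `wait -n` supervision ("if either goes, we go") can be sketched in Python under the same logic; the `sleep` commands below are hypothetical stand-ins for the SGLang, middleware, and haproxy processes:

```python
import subprocess
import time

# Hypothetical stand-ins for the three supervised processes:
# the short-lived one plays a crashed SGLang.
procs = {
    "sglang": subprocess.Popen(["sleep", "0.2"]),
    "middleware": subprocess.Popen(["sleep", "10"]),
    "haproxy": subprocess.Popen(["sleep", "10"]),
}

# Equivalent of `wait -n`: poll until any child exits.
exited = None
while exited is None:
    for name, p in procs.items():
        if p.poll() is not None:
            exited = name
            break
    time.sleep(0.05)

# If one goes, they all go.
for name, p in procs.items():
    if name != exited:
        p.terminate()

print(exited)  # → prints "sglang"
```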
vllm_middleware.py (new file)
@@ -0,0 +1,106 @@
"""
vLLM → SGLang request middleware.

Sits between haproxy and SGLang to strip vLLM-only parameters
that cause SGLang to return 422/400 errors.

Currently strips: logprobs, top_logprobs
(SGLang's Mistral tool-call parser rejects these; vLLM accepts them.)

Architecture:
    haproxy (original port) → middleware (port+2) → SGLang (port+1)

haproxy still handles /metrics stub and /health instant responses.
This middleware only touches the proxied request bodies.
"""

import json
import os
import httpx
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse, Response
import uvicorn

SGLANG_PORT = int(os.environ.get("SGLANG_PORT", "8001"))
LISTEN_PORT = int(os.environ.get("MIDDLEWARE_PORT", "8002"))

# Params that vLLM accepts but SGLang rejects.
# Extend this set as more incompatibilities are discovered.
STRIP_PARAMS = {"logprobs", "top_logprobs"}

app = FastAPI()
client: httpx.AsyncClient | None = None


@app.on_event("startup")
async def startup():
    global client
    client = httpx.AsyncClient(
        base_url=f"http://127.0.0.1:{SGLANG_PORT}",
        timeout=httpx.Timeout(300.0),
    )


@app.on_event("shutdown")
async def shutdown():
    await client.aclose()


@app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "PATCH", "OPTIONS"])
async def proxy(path: str, request: Request):
    body = await request.body()
    is_streaming = False

    # Strip incompatible params from chat completion POST requests
    if request.method == "POST" and "chat/completions" in path and body:
        try:
            data = json.loads(body)
            is_streaming = data.get("stream", False)
            stripped_any = False
            for key in STRIP_PARAMS:
                if key in data:
                    del data[key]
                    stripped_any = True
            if stripped_any:
                body = json.dumps(data).encode()
        except (json.JSONDecodeError, UnicodeDecodeError):
            pass

    # Forward headers (skip hop-by-hop and ones we're replacing)
    fwd_headers = {
        k: v for k, v in request.headers.items()
        if k.lower() not in ("host", "content-length", "transfer-encoding")
    }
    fwd_headers["content-length"] = str(len(body))

    url = f"http://127.0.0.1:{SGLANG_PORT}/{path}"
    if request.query_params:
        url += f"?{request.query_params}"

    if is_streaming:
        req = client.build_request(request.method, url, content=body, headers=fwd_headers)
        resp = await client.send(req, stream=True)

        async def stream_body():
            try:
                async for chunk in resp.aiter_bytes():
                    yield chunk
            finally:
                await resp.aclose()

        return StreamingResponse(
            stream_body(),
            status_code=resp.status_code,
            headers={"content-type": resp.headers.get("content-type", "text/event-stream")},
        )
    else:
        resp = await client.request(request.method, url, content=body, headers=fwd_headers)
        return Response(
            content=resp.content,
            status_code=resp.status_code,
            media_type=resp.headers.get("content-type"),
        )


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=LISTEN_PORT, log_level="warning")
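The header-forwarding rule in `vllm_middleware.py` above can be exercised in isolation; the incoming header values here are hypothetical:

```python
# Incoming headers as a client might send them (hypothetical values).
incoming = {
    "host": "gateway.local",
    "content-length": "123",
    "authorization": "Bearer abc",
    "accept": "application/json",
}
body = b'{"messages": []}'

# Same filter as the middleware: drop host/content-length/transfer-encoding
# (the proxy sets its own), keep everything else, then recompute
# content-length for the possibly rewritten body.
fwd_headers = {
    k: v for k, v in incoming.items()
    if k.lower() not in ("host", "content-length", "transfer-encoding")
}
fwd_headers["content-length"] = str(len(body))

print(sorted(fwd_headers))  # → ['accept', 'authorization', 'content-length']
```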
vllm_shim_module.py

@@ -63,10 +63,12 @@ def main():
         else:
             i += 1
 
-    # SGLang runs one port higher; haproxy binds the original port
+    # SGLang runs one port higher; middleware two ports higher
     sglang_port = str(int(port) + 1)
+    middleware_port = str(int(port) + 2)
 
     print(f"Launching SGLang on {host}:{sglang_port} (internal)")
+    print(f"Launching middleware on {host}:{middleware_port} (strips logprobs)")
     print(f"Launching haproxy on {host}:{port} (front door, /metrics + /health stub)")
     print()
@@ -112,12 +114,12 @@ frontend proxy
 backend sglang
     option httpchk GET /health
     http-check expect status 200
-    server s1 127.0.0.1:{sglang_port} check inter 5s fall 3 rise 2
+    server s1 127.0.0.1:{middleware_port} check inter 5s fall 3 rise 2
 """)
 
     with open(log_path, "a") as f:
         f.write(f"haproxy config written to {haproxy_cfg}\n")
-        f.write(f"SGLang port: {sglang_port}, haproxy port: {port}\n")
+        f.write(f"SGLang port: {sglang_port}, middleware port: {middleware_port}, haproxy port: {port}\n")
 
     # Start SGLang in the background
     sglang_proc = subprocess.Popen(
@@ -131,6 +133,15 @@ backend sglang
         ],
     )
 
+    # Start the middleware (strips vLLM-only params like logprobs)
+    middleware_env = os.environ.copy()
+    middleware_env["SGLANG_PORT"] = sglang_port
+    middleware_env["MIDDLEWARE_PORT"] = middleware_port
+    middleware_proc = subprocess.Popen(
+        [sys.executable, "/opt/vllm-shim/vllm_middleware.py"],
+        env=middleware_env,
+    )
+
     # Give SGLang a moment before haproxy starts routing
     time.sleep(2)
@@ -138,19 +149,27 @@ backend sglang
 
     haproxy_proc = subprocess.Popen(["haproxy", "-f", haproxy_cfg])
 
     with open(log_path, "a") as f:
-        f.write(f"SGLang PID: {sglang_proc.pid}, haproxy PID: {haproxy_proc.pid}\n")
+        f.write(f"SGLang PID: {sglang_proc.pid}, middleware PID: {middleware_proc.pid}, haproxy PID: {haproxy_proc.pid}\n")
 
     # Wait for whichever dies first
     while True:
         sglang_ret = sglang_proc.poll()
+        middleware_ret = middleware_proc.poll()
         haproxy_ret = haproxy_proc.poll()
         if sglang_ret is not None:
             print(f"SGLang exited (code {sglang_ret}), shutting down")
+            middleware_proc.terminate()
             haproxy_proc.terminate()
             os._exit(sglang_ret)
+        if middleware_ret is not None:
+            print(f"Middleware exited (code {middleware_ret}), shutting down")
+            sglang_proc.terminate()
+            haproxy_proc.terminate()
+            os._exit(middleware_ret)
         if haproxy_ret is not None:
             print(f"haproxy exited (code {haproxy_ret}), shutting down")
             sglang_proc.terminate()
+            middleware_proc.terminate()
             os._exit(haproxy_ret)
         time.sleep(1)