Add middleware to strip vLLM-only params (logprobs/top_logprobs) before forwarding to SGLang
SGLang's Mistral tool-call parser rejects logprobs/top_logprobs with 422, while vLLM accepts them. Clients like OpenClaw send these by default. New architecture: haproxy (port N) → middleware (port N+2) → SGLang (port N+1). The middleware is a thin FastAPI app that strips incompatible params from chat completion request bodies and passes everything else through unchanged.
This commit is contained in:
@@ -18,6 +18,7 @@ RUN mkdir -p /opt/vllm-shim/vllm/entrypoints/openai \
|
|||||||
COPY vllm_shim_module.py /opt/vllm-shim/vllm/__main__.py
|
COPY vllm_shim_module.py /opt/vllm-shim/vllm/__main__.py
|
||||||
COPY vllm_shim_module.py /opt/vllm-shim/vllm/entrypoints/openai/api_server.py
|
COPY vllm_shim_module.py /opt/vllm-shim/vllm/entrypoints/openai/api_server.py
|
||||||
COPY vllm_shim_module.py /opt/vllm-shim/vllm/entrypoints/cli/main.py
|
COPY vllm_shim_module.py /opt/vllm-shim/vllm/entrypoints/cli/main.py
|
||||||
|
COPY vllm_middleware.py /opt/vllm-shim/vllm_middleware.py
|
||||||
RUN touch /opt/vllm-shim/vllm/__init__.py \
|
RUN touch /opt/vllm-shim/vllm/__init__.py \
|
||||||
/opt/vllm-shim/vllm/entrypoints/__init__.py \
|
/opt/vllm-shim/vllm/entrypoints/__init__.py \
|
||||||
/opt/vllm-shim/vllm/entrypoints/openai/__init__.py \
|
/opt/vllm-shim/vllm/entrypoints/openai/__init__.py \
|
||||||
|
|||||||
20
README.md
20
README.md
@@ -27,6 +27,14 @@ Rather than launching SGLang directly on the vLLM port, the shim runs **haproxy*
|
|||||||
|
|
||||||
2. **`/health` probe timing** — SGLang's `/health` endpoint takes ~1.001s to respond, which races the 1s k8s probe timeout and causes repeated `Startup probe failed: context deadline exceeded`. haproxy health-checks SGLang in the background (every 5s, with a 3s timeout) and responds to `/health` probes **instantly** — 200 if the backend is up, 503 if it's not. No more timeout roulette.
|
2. **`/health` probe timing** — SGLang's `/health` endpoint takes ~1.001s to respond, which races the 1s k8s probe timeout and causes repeated `Startup probe failed: context deadline exceeded`. haproxy health-checks SGLang in the background (every 5s, with a 3s timeout) and responds to `/health` probes **instantly** — 200 if the backend is up, 503 if it's not. No more timeout roulette.
|
||||||
|
|
||||||
|
### middleware layer
|
||||||
|
|
||||||
|
A Python middleware (FastAPI) sits between haproxy and SGLang on **port+2**. It strips vLLM-only request parameters that SGLang rejects with 422 errors:
|
||||||
|
|
||||||
|
- **`logprobs`** / **`top_logprobs`** — vLLM accepts these on chat completion requests; SGLang's Mistral tool-call parser rejects them. OpenClaw and other vLLM clients send them by default.
|
||||||
|
|
||||||
|
The middleware only touches `POST /v1/chat/completions` request bodies and passes everything else through unchanged. To strip additional params, add them to the `STRIP_PARAMS` set in `vllm_middleware.py`.
|
||||||
|
|
||||||
```
|
```
|
||||||
┌─────────────────────────────────────────────┐
|
┌─────────────────────────────────────────────┐
|
||||||
│ k8s probes / vLLM stack │
|
│ k8s probes / vLLM stack │
|
||||||
@@ -36,7 +44,12 @@ Rather than launching SGLang directly on the vLLM port, the shim runs **haproxy*
|
|||||||
│ /metrics ──► 200 empty (stub) │
|
│ /metrics ──► 200 empty (stub) │
|
||||||
│ /health ──► 200/503 instant (backend │
|
│ /health ──► 200/503 instant (backend │
|
||||||
│ health-checked in bg) │
|
│ health-checked in bg) │
|
||||||
│ /* ──► proxy to SGLang │
|
│ /* ──► proxy to middleware │
|
||||||
|
│ │ │
|
||||||
|
│ ▼ │
|
||||||
|
│ middleware (port 8002) │
|
||||||
|
│ strips logprobs/top_logprobs │
|
||||||
|
│ forwards to SGLang │
|
||||||
│ │ │
|
│ │ │
|
||||||
│ ▼ │
|
│ ▼ │
|
||||||
│ SGLang (port 8001) │
|
│ SGLang (port 8001) │
|
||||||
@@ -86,5 +99,6 @@ To adapt for a different model, change `--model-path`, `--tp`, and `--tool-call-
|
|||||||
| File | Purpose |
|
| File | Purpose |
|
||||||
|---|---|
|
|---|---|
|
||||||
| `Dockerfile` | Builds the image: ROCm SGLang base + haproxy + shims + MI300X env |
|
| `Dockerfile` | Builds the image: ROCm SGLang base + haproxy + shims + MI300X env |
|
||||||
| `vllm-shim.sh` | Shell shim — replaces the `vllm` binary, launches SGLang + haproxy |
|
| `vllm-shim.sh` | Shell shim — replaces the `vllm` binary, launches SGLang + middleware + haproxy |
|
||||||
| `vllm_shim_module.py` | Python shim — shadows `vllm.*` module imports, launches SGLang + haproxy |
|
| `vllm_shim_module.py` | Python shim — shadows `vllm.*` module imports, launches SGLang + middleware + haproxy |
|
||||||
|
| `vllm_middleware.py` | FastAPI middleware — strips vLLM-only params (logprobs) before forwarding to SGLang |
|
||||||
|
|||||||
17
vllm-shim.sh
17
vllm-shim.sh
@@ -63,9 +63,12 @@ while [[ $# -gt 0 ]]; do
|
|||||||
done
|
done
|
||||||
|
|
||||||
# SGLang runs one port higher; haproxy binds the original port
|
# SGLang runs one port higher; haproxy binds the original port
|
||||||
|
# Middleware runs two ports higher (strips vLLM-only params)
|
||||||
SGLANG_PORT=$((PORT + 1))
|
SGLANG_PORT=$((PORT + 1))
|
||||||
|
MIDDLEWARE_PORT=$((PORT + 2))
|
||||||
|
|
||||||
echo "Launching SGLang on ${HOST}:${SGLANG_PORT} (internal)"
|
echo "Launching SGLang on ${HOST}:${SGLANG_PORT} (internal)"
|
||||||
|
echo "Launching middleware on ${HOST}:${MIDDLEWARE_PORT} (strips logprobs)"
|
||||||
echo "Launching haproxy on ${HOST}:${PORT} (front door, /metrics + /health stub)"
|
echo "Launching haproxy on ${HOST}:${PORT} (front door, /metrics + /health stub)"
|
||||||
echo ""
|
echo ""
|
||||||
|
|
||||||
@@ -109,7 +112,7 @@ frontend proxy
|
|||||||
backend sglang
|
backend sglang
|
||||||
option httpchk GET /health
|
option httpchk GET /health
|
||||||
http-check expect status 200
|
http-check expect status 200
|
||||||
server s1 127.0.0.1:${SGLANG_PORT} check inter 5s fall 3 rise 2
|
server s1 127.0.0.1:${MIDDLEWARE_PORT} check inter 5s fall 3 rise 2
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
echo "haproxy config written to ${HAPROXY_CFG}" >> "$LOG_PATH"
|
echo "haproxy config written to ${HAPROXY_CFG}" >> "$LOG_PATH"
|
||||||
@@ -124,6 +127,12 @@ python -m sglang.launch_server \
|
|||||||
|
|
||||||
SGLANG_PID=$!
|
SGLANG_PID=$!
|
||||||
|
|
||||||
|
# Start the middleware (strips vLLM-only params like logprobs)
|
||||||
|
SGLANG_PORT=$SGLANG_PORT MIDDLEWARE_PORT=$MIDDLEWARE_PORT \
|
||||||
|
python /opt/vllm-shim/vllm_middleware.py &
|
||||||
|
|
||||||
|
MIDDLEWARE_PID=$!
|
||||||
|
|
||||||
# Give SGLang a moment to start before haproxy starts routing
|
# Give SGLang a moment to start before haproxy starts routing
|
||||||
sleep 2
|
sleep 2
|
||||||
|
|
||||||
@@ -132,11 +141,11 @@ haproxy -f "$HAPROXY_CFG" &
|
|||||||
|
|
||||||
HAPROXY_PID=$!
|
HAPROXY_PID=$!
|
||||||
|
|
||||||
echo "SGLang PID: ${SGLANG_PID}, haproxy PID: ${HAPROXY_PID}" >> "$LOG_PATH"
|
echo "SGLang PID: ${SGLANG_PID}, middleware PID: ${MIDDLEWARE_PID}, haproxy PID: ${HAPROXY_PID}" >> "$LOG_PATH"
|
||||||
|
|
||||||
# Wait for whichever dies first — if either goes, we go
|
# Wait for whichever dies first — if either goes, we go
|
||||||
wait -n "$SGLANG_PID" "$HAPROXY_PID"
|
wait -n "$SGLANG_PID" "$MIDDLEWARE_PID" "$HAPROXY_PID"
|
||||||
EXIT_CODE=$?
|
EXIT_CODE=$?
|
||||||
echo "A process exited (code ${EXIT_CODE}), shutting down" >> "$LOG_PATH"
|
echo "A process exited (code ${EXIT_CODE}), shutting down" >> "$LOG_PATH"
|
||||||
kill "$SGLANG_PID" "$HAPROXY_PID" 2>/dev/null || true
|
kill "$SGLANG_PID" "$MIDDLEWARE_PID" "$HAPROXY_PID" 2>/dev/null || true
|
||||||
exit $EXIT_CODE
|
exit $EXIT_CODE
|
||||||
|
|||||||
106
vllm_middleware.py
Normal file
106
vllm_middleware.py
Normal file
@@ -0,0 +1,106 @@
|
|||||||
|
"""
|
||||||
|
vLLM → SGLang request middleware.
|
||||||
|
|
||||||
|
Sits between haproxy and SGLang to strip vLLM-only parameters
|
||||||
|
that cause SGLang to return 422/400 errors.
|
||||||
|
|
||||||
|
Currently strips: logprobs, top_logprobs
|
||||||
|
(SGLang's Mistral tool-call parser rejects these; vLLM accepts them.)
|
||||||
|
|
||||||
|
Architecture:
|
||||||
|
haproxy (original port) → middleware (port+2) → SGLang (port+1)
|
||||||
|
|
||||||
|
haproxy still handles /metrics stub and /health instant responses.
|
||||||
|
This middleware only touches the proxied request bodies.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import httpx
|
||||||
|
from fastapi import FastAPI, Request
|
||||||
|
from fastapi.responses import StreamingResponse, Response
|
||||||
|
import uvicorn
|
||||||
|
|
||||||
|
SGLANG_PORT = int(os.environ.get("SGLANG_PORT", "8001"))
|
||||||
|
LISTEN_PORT = int(os.environ.get("MIDDLEWARE_PORT", "8002"))
|
||||||
|
|
||||||
|
# Params that vLLM accepts but SGLang rejects.
|
||||||
|
# Extend this set as more incompatibilities are discovered.
|
||||||
|
STRIP_PARAMS = {"logprobs", "top_logprobs"}
|
||||||
|
|
||||||
|
app = FastAPI()
|
||||||
|
client: httpx.AsyncClient | None = None
|
||||||
|
|
||||||
|
|
||||||
|
@app.on_event("startup")
|
||||||
|
async def startup():
|
||||||
|
global client
|
||||||
|
client = httpx.AsyncClient(
|
||||||
|
base_url=f"http://127.0.0.1:{SGLANG_PORT}",
|
||||||
|
timeout=httpx.Timeout(300.0),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@app.on_event("shutdown")
|
||||||
|
async def shutdown():
|
||||||
|
await client.aclose()
|
||||||
|
|
||||||
|
|
||||||
|
@app.api_route("/{path:path}", methods=["GET", "POST", "PUT", "DELETE", "PATCH", "OPTIONS"])
|
||||||
|
async def proxy(path: str, request: Request):
|
||||||
|
body = await request.body()
|
||||||
|
is_streaming = False
|
||||||
|
|
||||||
|
# Strip incompatible params from chat completion POST requests
|
||||||
|
if request.method == "POST" and "chat/completions" in path and body:
|
||||||
|
try:
|
||||||
|
data = json.loads(body)
|
||||||
|
is_streaming = data.get("stream", False)
|
||||||
|
stripped_any = False
|
||||||
|
for key in STRIP_PARAMS:
|
||||||
|
if key in data:
|
||||||
|
del data[key]
|
||||||
|
stripped_any = True
|
||||||
|
if stripped_any:
|
||||||
|
body = json.dumps(data).encode()
|
||||||
|
except (json.JSONDecodeError, UnicodeDecodeError):
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Forward headers (skip hop-by-hop and ones we're replacing)
|
||||||
|
fwd_headers = {
|
||||||
|
k: v for k, v in request.headers.items()
|
||||||
|
if k.lower() not in ("host", "content-length", "transfer-encoding")
|
||||||
|
}
|
||||||
|
fwd_headers["content-length"] = str(len(body))
|
||||||
|
|
||||||
|
url = f"http://127.0.0.1:{SGLANG_PORT}/{path}"
|
||||||
|
if request.query_params:
|
||||||
|
url += f"?{request.query_params}"
|
||||||
|
|
||||||
|
if is_streaming:
|
||||||
|
req = client.build_request(request.method, url, content=body, headers=fwd_headers)
|
||||||
|
resp = await client.send(req, stream=True)
|
||||||
|
|
||||||
|
async def stream_body():
|
||||||
|
try:
|
||||||
|
async for chunk in resp.aiter_bytes():
|
||||||
|
yield chunk
|
||||||
|
finally:
|
||||||
|
await resp.aclose()
|
||||||
|
|
||||||
|
return StreamingResponse(
|
||||||
|
stream_body(),
|
||||||
|
status_code=resp.status_code,
|
||||||
|
headers={"content-type": resp.headers.get("content-type", "text/event-stream")},
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
resp = await client.request(request.method, url, content=body, headers=fwd_headers)
|
||||||
|
return Response(
|
||||||
|
content=resp.content,
|
||||||
|
status_code=resp.status_code,
|
||||||
|
media_type=resp.headers.get("content-type"),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
uvicorn.run(app, host="0.0.0.0", port=LISTEN_PORT, log_level="warning")
|
||||||
@@ -63,10 +63,12 @@ def main():
|
|||||||
else:
|
else:
|
||||||
i += 1
|
i += 1
|
||||||
|
|
||||||
# SGLang runs one port higher; haproxy binds the original port
|
# SGLang runs one port higher; middleware two ports higher
|
||||||
sglang_port = str(int(port) + 1)
|
sglang_port = str(int(port) + 1)
|
||||||
|
middleware_port = str(int(port) + 2)
|
||||||
|
|
||||||
print(f"Launching SGLang on {host}:{sglang_port} (internal)")
|
print(f"Launching SGLang on {host}:{sglang_port} (internal)")
|
||||||
|
print(f"Launching middleware on {host}:{middleware_port} (strips logprobs)")
|
||||||
print(f"Launching haproxy on {host}:{port} (front door, /metrics + /health stub)")
|
print(f"Launching haproxy on {host}:{port} (front door, /metrics + /health stub)")
|
||||||
print()
|
print()
|
||||||
|
|
||||||
@@ -112,12 +114,12 @@ frontend proxy
|
|||||||
backend sglang
|
backend sglang
|
||||||
option httpchk GET /health
|
option httpchk GET /health
|
||||||
http-check expect status 200
|
http-check expect status 200
|
||||||
server s1 127.0.0.1:{sglang_port} check inter 5s fall 3 rise 2
|
server s1 127.0.0.1:{middleware_port} check inter 5s fall 3 rise 2
|
||||||
""")
|
""")
|
||||||
|
|
||||||
with open(log_path, "a") as f:
|
with open(log_path, "a") as f:
|
||||||
f.write(f"haproxy config written to {haproxy_cfg}\n")
|
f.write(f"haproxy config written to {haproxy_cfg}\n")
|
||||||
f.write(f"SGLang port: {sglang_port}, haproxy port: {port}\n")
|
f.write(f"SGLang port: {sglang_port}, middleware port: {middleware_port}, haproxy port: {port}\n")
|
||||||
|
|
||||||
# Start SGLang in the background
|
# Start SGLang in the background
|
||||||
sglang_proc = subprocess.Popen(
|
sglang_proc = subprocess.Popen(
|
||||||
@@ -131,6 +133,15 @@ backend sglang
|
|||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Start the middleware (strips vLLM-only params like logprobs)
|
||||||
|
middleware_env = os.environ.copy()
|
||||||
|
middleware_env["SGLANG_PORT"] = sglang_port
|
||||||
|
middleware_env["MIDDLEWARE_PORT"] = middleware_port
|
||||||
|
middleware_proc = subprocess.Popen(
|
||||||
|
[sys.executable, "/opt/vllm-shim/vllm_middleware.py"],
|
||||||
|
env=middleware_env,
|
||||||
|
)
|
||||||
|
|
||||||
# Give SGLang a moment before haproxy starts routing
|
# Give SGLang a moment before haproxy starts routing
|
||||||
time.sleep(2)
|
time.sleep(2)
|
||||||
|
|
||||||
@@ -138,19 +149,27 @@ backend sglang
|
|||||||
haproxy_proc = subprocess.Popen(["haproxy", "-f", haproxy_cfg])
|
haproxy_proc = subprocess.Popen(["haproxy", "-f", haproxy_cfg])
|
||||||
|
|
||||||
with open(log_path, "a") as f:
|
with open(log_path, "a") as f:
|
||||||
f.write(f"SGLang PID: {sglang_proc.pid}, haproxy PID: {haproxy_proc.pid}\n")
|
f.write(f"SGLang PID: {sglang_proc.pid}, middleware PID: {middleware_proc.pid}, haproxy PID: {haproxy_proc.pid}\n")
|
||||||
|
|
||||||
# Wait for whichever dies first
|
# Wait for whichever dies first
|
||||||
while True:
|
while True:
|
||||||
sglang_ret = sglang_proc.poll()
|
sglang_ret = sglang_proc.poll()
|
||||||
|
middleware_ret = middleware_proc.poll()
|
||||||
haproxy_ret = haproxy_proc.poll()
|
haproxy_ret = haproxy_proc.poll()
|
||||||
if sglang_ret is not None:
|
if sglang_ret is not None:
|
||||||
print(f"SGLang exited (code {sglang_ret}), shutting down")
|
print(f"SGLang exited (code {sglang_ret}), shutting down")
|
||||||
|
middleware_proc.terminate()
|
||||||
haproxy_proc.terminate()
|
haproxy_proc.terminate()
|
||||||
os._exit(sglang_ret)
|
os._exit(sglang_ret)
|
||||||
|
if middleware_ret is not None:
|
||||||
|
print(f"Middleware exited (code {middleware_ret}), shutting down")
|
||||||
|
sglang_proc.terminate()
|
||||||
|
haproxy_proc.terminate()
|
||||||
|
os._exit(middleware_ret)
|
||||||
if haproxy_ret is not None:
|
if haproxy_ret is not None:
|
||||||
print(f"haproxy exited (code {haproxy_ret}), shutting down")
|
print(f"haproxy exited (code {haproxy_ret}), shutting down")
|
||||||
sglang_proc.terminate()
|
sglang_proc.terminate()
|
||||||
|
middleware_proc.terminate()
|
||||||
os._exit(haproxy_ret)
|
os._exit(haproxy_ret)
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user