From 7fb373fdfce898f1d19f5fe52d05d8a1401fe442 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Sun, 12 Apr 2026 17:09:58 +0000 Subject: [PATCH] Add haproxy proxy: /metrics returns 200 empty, everything else proxies to SGLang SGLang now runs on port+1, haproxy binds the original vLLM port. haproxy serves a stub /metrics endpoint (200, empty body) and passes all other traffic through to SGLang as an HTTP-mode reverse proxy. --- Dockerfile | 6 ++++ vllm-shim.sh | 76 ++++++++++++++++++++++++++++++++++++++++++--- vllm_shim_module.py | 75 +++++++++++++++++++++++++++++++++++++++----- 3 files changed, 146 insertions(+), 11 deletions(-) diff --git a/Dockerfile b/Dockerfile index 5137e95..bc0aeed 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,11 @@ FROM lmsysorg/sglang-rocm:v0.5.10rc0-rocm700-mi30x-20260411 +# --------------------------------------------------------------- +# haproxy: routes /metrics stub, proxies everything else to SGLang +# --------------------------------------------------------------- +RUN apt-get update && apt-get install -y --no-install-recommends haproxy \ + && rm -rf /var/lib/apt/lists/* + # --------------------------------------------------------------- # Replace the vllm binary with our shim # --------------------------------------------------------------- diff --git a/vllm-shim.sh b/vllm-shim.sh index 76e1090..e52c917 100644 --- a/vllm-shim.sh +++ b/vllm-shim.sh @@ -5,6 +5,12 @@ set -euo pipefail # vLLM -> SGLang shim # This script replaces the vllm binary. The k8s production stack # calls `vllm serve [flags]`, and we intercept everything. 
+# +# Architecture: +# haproxy on the vLLM port (front door) +# /metrics → 200 empty response +# /* → proxy to SGLang on port+1 +# SGLang on port+1 (internal) # ============================================================ echo "" @@ -22,6 +28,20 @@ done echo "==========================================" echo "" +# Log to file +LOG_PATH="${VLLM_SHIM_LOG:-/tmp/vllm-shim.log}" +{ + echo "$(date -Iseconds) vLLM -> SGLang Shim (shell)" + echo " Invoked as: vllm $*" + echo " All arguments received:" + i=1 + for arg in "$@"; do + echo " [$i] $arg" + i=$((i + 1)) + done + echo "" +} >> "$LOG_PATH" + # Defaults HOST="0.0.0.0" PORT="8000" @@ -38,12 +58,60 @@ while [[ $# -gt 0 ]]; do esac done -echo "Launching SGLang on ${HOST}:${PORT}" +# SGLang runs one port higher; haproxy binds the original port +SGLANG_PORT=$((PORT + 1)) + +echo "Launching SGLang on ${HOST}:${SGLANG_PORT} (internal)" +echo "Launching haproxy on ${HOST}:${PORT} (front door, /metrics stub)" echo "" -exec python -m sglang.launch_server \ +# Write haproxy config +HAPROXY_CFG="/tmp/haproxy-shim.cfg" +cat > "$HAPROXY_CFG" <> "$LOG_PATH" + +# Start SGLang in the background +python -m sglang.launch_server \ --model-path mistralai/Devstral-2-123B-Instruct-2512 \ --host "$HOST" \ --port "$SGLANG_PORT" \ --tp 8 \ - --tool-call-parser mistral \ No newline at end of file + --tool-call-parser mistral & + +SGLANG_PID=$! + +# Give SGLang a moment to start before haproxy starts routing +sleep 2 + +# Start haproxy in the background as well; this script stays in the foreground and waits on both children +haproxy -f "$HAPROXY_CFG" & + +HAPROXY_PID=$! + +echo "SGLang PID: ${SGLANG_PID}, haproxy PID: ${HAPROXY_PID}" >> "$LOG_PATH" + +# Wait for whichever dies first — if either goes, we go +wait -n "$SGLANG_PID" "$HAPROXY_PID" +EXIT_CODE=$? 
+echo "A process exited (code ${EXIT_CODE}), shutting down" >> "$LOG_PATH" +kill "$SGLANG_PID" "$HAPROXY_PID" 2>/dev/null || true +exit $EXIT_CODE diff --git a/vllm_shim_module.py b/vllm_shim_module.py index b7666c1..8c8efb2 100644 --- a/vllm_shim_module.py +++ b/vllm_shim_module.py @@ -1,18 +1,25 @@ """ vLLM -> SGLang Python shim. Catches `python -m vllm.entrypoints.openai.api_server` (and similar) -and launches SGLang instead. +and launches SGLang behind haproxy instead. + +Architecture: + haproxy on the vLLM port (front door) + /metrics → 200 empty response + /* → proxy to SGLang on port+1 + SGLang on port+1 (internal) """ import os import sys import subprocess +import signal def main(): args = sys.argv[1:] log_path = os.environ.get("VLLM_SHIM_LOG", "/tmp/vllm-shim.log") + import datetime with open(log_path, "a") as f: - import datetime f.write(f"\n{datetime.datetime.now().isoformat()} vLLM -> SGLang Shim (Python module)\n") f.write(f" Invoked as: python -m {__name__} {' '.join(args)}\n") f.write(" All arguments received:\n") @@ -52,23 +59,77 @@ def main(): else: i += 1 - print(f"Launching SGLang on {host}:{port}") + # SGLang runs one port higher; haproxy binds the original port + sglang_port = str(int(port) + 1) + + print(f"Launching SGLang on {host}:{sglang_port} (internal)") + print(f"Launching haproxy on {host}:{port} (front door, /metrics stub)") print() - os.execvp( - sys.executable, + # Write haproxy config + haproxy_cfg = "/tmp/haproxy-shim.cfg" + with open(haproxy_cfg, "w") as f: + f.write(f"""global + log /dev/log local0 + maxconn 4096 + +defaults + mode http + timeout connect 5s + timeout client 300s + timeout server 300s + +frontend proxy + bind {host}:{port} + http-request return status 200 content-type text/plain "" if {{ path /metrics }} + default_backend sglang + +backend sglang + server s1 127.0.0.1:{sglang_port} +""") + + with open(log_path, "a") as f: + f.write(f"haproxy config written to {haproxy_cfg}\n") + f.write(f"SGLang port: 
{sglang_port}, haproxy port: {port}\n") + + # Start SGLang in the background + sglang_proc = subprocess.Popen( [ sys.executable, "-m", "sglang.launch_server", "--model-path", "mistralai/Devstral-2-123B-Instruct-2512", "--host", host, - "--port", port, + "--port", sglang_port, "--tp", "8", "--tool-call-parser", "mistral", ], ) + # Give SGLang a moment before haproxy starts routing + import time + time.sleep(2) + + # Start haproxy in the background + haproxy_proc = subprocess.Popen(["haproxy", "-f", haproxy_cfg]) + + with open(log_path, "a") as f: + f.write(f"SGLang PID: {sglang_proc.pid}, haproxy PID: {haproxy_proc.pid}\n") + + # Wait for whichever dies first + while True: + sglang_ret = sglang_proc.poll() + haproxy_ret = haproxy_proc.poll() + if sglang_ret is not None: + print(f"SGLang exited (code {sglang_ret}), shutting down") + haproxy_proc.terminate() + os._exit(sglang_ret) + if haproxy_ret is not None: + print(f"haproxy exited (code {haproxy_ret}), shutting down") + sglang_proc.terminate() + os._exit(haproxy_ret) + time.sleep(1) + if __name__ == "__main__": main() # Also run if imported as a module (some invocation paths just import the file) -main() \ No newline at end of file +main()