#!/bin/bash
set -euo pipefail

# ============================================================
# vLLM -> SGLang shim
# This script replaces the vllm binary. The k8s production stack
# calls `vllm serve [flags]`, and we intercept everything.
#
# Architecture:
#   haproxy on the vLLM port (front door)
#     /metrics → 200 empty (stub)
#     /health  → 200 if SGLang backend is up, 503 if not (instant)
#     /*       → proxied towards SGLang via the middleware on port+2
#   SGLang on port+1 (internal)
#   middleware on port+2 (internal, strips vLLM-only params)
#
# haproxy 2.4 compat: uses errorfile for stub responses instead
# of http-request return (which needs 2.8+ for payload syntax).
#
# Only --host / --port are honoured; every other vLLM flag is ignored.
# Environment:
#   VLLM_SHIM_LOG  path of the shim log file (default /tmp/vllm-shim.log)
# ============================================================

# Print an error to stderr and abort with the conventional usage status.
die() {
  printf 'vllm-shim: %s\n' "$*" >&2
  exit 2
}

# Print the numbered list of arguments this shim was invoked with.
list_args() {
  local i=1 arg
  for arg in "$@"; do
    echo " [$i] $arg"
    i=$((i + 1))
  done
}

echo ""
echo "=========================================="
echo " vLLM -> SGLang Shim"
echo "=========================================="
echo " Invoked as: vllm $*"
echo ""
echo " All arguments received:"
list_args "$@"
echo "=========================================="
echo ""

# Log to file
LOG_PATH="${VLLM_SHIM_LOG:-/tmp/vllm-shim.log}"
{
  echo "$(date -Iseconds) vLLM -> SGLang Shim (shell)"
  echo " Invoked as: vllm $*"
  echo " All arguments received:"
  list_args "$@"
  echo ""
} >> "$LOG_PATH"

# Defaults
HOST="0.0.0.0"
PORT="8000"

# Parse host and port from whatever the stack sends
while [[ $# -gt 0 ]]; do
  case "$1" in
    serve)
      shift # skip the 'serve' subcommand
      ;;
    --host)
      # Without this check a trailing --host trips `set -u` on $2.
      [[ $# -ge 2 ]] || die "--host requires a value"
      HOST="$2"
      shift 2
      ;;
    --host=*)
      HOST="${1#*=}"
      shift
      ;;
    --port)
      [[ $# -ge 2 ]] || die "--port requires a value"
      PORT="$2"
      shift 2
      ;;
    --port=*)
      PORT="${1#*=}"
      shift
      ;;
    *)
      shift # ignore everything else
      ;;
  esac
done

# PORT is fed into $(( )) below; reject anything that is not a plain
# decimal number so arbitrary text is never arithmetically evaluated.
[[ "$PORT" =~ ^[0-9]+$ ]] || die "invalid --port value: $PORT"
PORT=$((10#$PORT)) # force base 10 so leading zeros are not read as octal
# Two extra ports above PORT are needed, hence the 65533 ceiling.
((PORT >= 1 && PORT <= 65533)) || die "--port out of range (1-65533): $PORT"

# SGLang runs one port higher; haproxy binds the original port
# Middleware runs two ports higher (strips vLLM-only params)
SGLANG_PORT=$((PORT + 1))
MIDDLEWARE_PORT=$((PORT + 2))

echo "Launching SGLang on ${HOST}:${SGLANG_PORT} (internal)"
echo "Launching middleware on ${HOST}:${MIDDLEWARE_PORT} (strips logprobs)"
echo "Launching haproxy on ${HOST}:${PORT} (front door, /metrics + /health stub)"
echo ""

# Prepare error files for haproxy stub responses
# haproxy errorfile format: HTTP/1.x status_code reason\r\nheaders\r\n\r\nbody
mkdir -p /tmp/haproxy-errors
printf "HTTP/1.0 200 OK\r\nContent-Length: 0\r\nConnection: close\r\n\r\n" \
  > /tmp/haproxy-errors/200-empty.http
printf "HTTP/1.0 503 Service Unavailable\r\nContent-Length: 16\r\nConnection: close\r\nContent-Type: text/plain\r\n\r\nSGLang not ready" \
  > /tmp/haproxy-errors/503-sglang.http

# Address haproxy uses to health-check SGLang: a wildcard bind address is
# not connectable, so fall back to loopback in that case.
if [[ "$HOST" == "0.0.0.0" ]]; then
  SGLANG_CHECK_ADDR="127.0.0.1"
else
  SGLANG_CHECK_ADDR="$HOST"
fi

# Write haproxy config (compatible with haproxy 2.4)
# NOTE(review): the original heredoc body was missing from this file (only
# a dangling `cat > "$HAPROXY_CFG" <> "$LOG_PATH"` remained). The config
# below is reconstructed from the architecture described in the header;
# verify the timeouts, the backend target (assumes the middleware listens
# on 127.0.0.1:MIDDLEWARE_PORT) and the health-check path before deploying.
HAPROXY_CFG="/tmp/haproxy-shim.cfg"
cat > "$HAPROXY_CFG" <<EOF
global
    maxconn 4096

defaults
    mode http
    timeout connect 5s
    timeout client 1h
    timeout server 1h

frontend vllm_front
    bind ${HOST}:${PORT}

    acl is_metrics path /metrics
    acl is_health path /health
    acl sglang_up nbsrv(sglang) gt 0

    # Stub responses are served from errorfiles (haproxy 2.4 compatible).
    http-request deny deny_status 200 if is_metrics
    http-request deny deny_status 200 if is_health sglang_up
    http-request deny deny_status 503 if is_health

    errorfile 200 /tmp/haproxy-errors/200-empty.http
    errorfile 503 /tmp/haproxy-errors/503-sglang.http

    default_backend sglang

backend sglang
    # Traffic flows through the middleware; liveness is probed on SGLang itself.
    option httpchk GET /health
    http-check expect status 200
    server middleware 127.0.0.1:${MIDDLEWARE_PORT} check addr ${SGLANG_CHECK_ADDR} port ${SGLANG_PORT} inter 5s fall 3 rise 2
EOF

echo "haproxy config written to ${HAPROXY_CFG}" >> "$LOG_PATH"

# PIDs of every child started below, in launch order.
CHILD_PIDS=()

# Best-effort termination of all children started so far; a child that is
# already gone is not an error.
stop_children() {
  if [[ ${#CHILD_PIDS[@]} -gt 0 ]]; then
    kill "${CHILD_PIDS[@]}" 2>/dev/null || true
  fi
}

# This shell stays the parent of all three services, so a stop request must
# be forwarded explicitly or the children would never see it.
on_signal() {
  echo "Received signal, shutting down (exit $1)" >> "$LOG_PATH"
  stop_children
  exit "$1"
}
trap 'on_signal 143' TERM
trap 'on_signal 130' INT

# Start SGLang in the background
python -m sglang.launch_server \
  --model-path mistralai/Devstral-2-123B-Instruct-2512 \
  --host "$HOST" \
  --port "$SGLANG_PORT" \
  --tp 8 \
  --tool-call-parser mistral &
SGLANG_PID=$!
CHILD_PIDS+=("$SGLANG_PID")

# Start the middleware (strips vLLM-only params like logprobs)
SGLANG_PORT=$SGLANG_PORT MIDDLEWARE_PORT=$MIDDLEWARE_PORT \
  python /opt/vllm-shim/vllm_middleware.py &
MIDDLEWARE_PID=$!
CHILD_PIDS+=("$MIDDLEWARE_PID")

# Give SGLang a moment to start before haproxy starts routing
sleep 2

# Start haproxy in the background too, so this shell can supervise all
# three processes and tear everything down when any one of them exits.
haproxy -f "$HAPROXY_CFG" &
HAPROXY_PID=$!
CHILD_PIDS+=("$HAPROXY_PID")

echo "SGLang PID: ${SGLANG_PID}, middleware PID: ${MIDDLEWARE_PID}, haproxy PID: ${HAPROXY_PID}" >> "$LOG_PATH"

# Wait for whichever dies first — if any one goes, we go.
# The `|| EXIT_CODE=$?` form is required: under `set -e` a bare `wait -n`
# returning non-zero would abort the script before the cleanup below runs.
EXIT_CODE=0
wait -n "$SGLANG_PID" "$MIDDLEWARE_PID" "$HAPROXY_PID" || EXIT_CODE=$?

echo "A process exited (code ${EXIT_CODE}), shutting down" >> "$LOG_PATH"
stop_children
exit "$EXIT_CODE"