<?php

declare(strict_types=1);

require_once __DIR__ . '/vendor/autoload.php';

use ContextPaging\ContextPaging;
use ContextPaging\OpenAICompatibleClient;
use ContextPaging\TokenCounter;
use ContextPaging\RedisCache;
use ContextPaging\ToolCallMode;
use ContextPaging\Middleware\ModelQuirksMiddleware;
use Slim\Factory\AppFactory;
use Slim\Psr7\Response;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\ServerRequestInterface;
|
|
/**
|
|
* Context Paging — vLLM Facade with Slim Framework
|
|
*
|
|
* Drop-in HTTP proxy for vLLM with automatic context window management.
|
|
* Clients hit this as if it's vLLM; context paging happens transparently.
|
|
*
|
|
* Environment variables:
|
|
* VLLM_URL — Backend vLLM URL (default: http://localhost:8000/v1)
|
|
* VLLM_API_KEY — API key for backend (optional)
|
|
* MAX_CONTEXT_TOKENS — Maximum context window size (default: 128000)
|
|
* REDIS_URL — Redis URL for caching (optional, in-memory if not set)
|
|
* TOOL_CALL_MODE — native, raw, or auto (default: auto)
|
|
*/
|
|
|
|
// -----------------------------------------------------
// Load .env
// -----------------------------------------------------

/**
 * Load KEY=VALUE pairs from a dotenv-style file into the process environment.
 *
 * Parsing rules (original behavior, plus quote stripping):
 *  - lines starting with '#' and lines without '=' are skipped;
 *  - keys and values are trimmed;
 *  - a single pair of matching surrounding quotes ("..." or '...') around a
 *    value is removed — common dotenv convention the inline loader missed;
 *  - the real environment always wins: a key already visible via getenv()
 *    is never overwritten; empty keys are ignored.
 *
 * Missing files are silently skipped (the file is optional).
 */
function loadDotEnv(string $path): void
{
    if (!file_exists($path)) {
        return;
    }

    foreach (file($path, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES) as $line) {
        if (str_starts_with($line, '#') || !str_contains($line, '=')) {
            continue;
        }

        [$key, $value] = explode('=', $line, 2);
        $key = trim($key);
        $value = trim($value);

        // Strip one pair of matching surrounding quotes, if present.
        if (strlen($value) >= 2
            && ($value[0] === '"' || $value[0] === "'")
            && $value[strlen($value) - 1] === $value[0]
        ) {
            $value = substr($value, 1, -1);
        }

        if ($key !== '' && getenv($key) === false) {
            putenv("{$key}={$value}");
            $_ENV[$key] = $value;
        }
    }
}

loadDotEnv(__DIR__ . '/.env');
// -----------------------------------------------------
// Configuration
// -----------------------------------------------------

// getenv() returns false (not null) when a variable is unset, so `?:` is
// the correct fallback operator throughout this section — `??` would not
// trigger on the false return value.

// Backend vLLM endpoint; API_BASE_URL is accepted as an alternate name.
$vllmUrl = getenv('VLLM_URL') ?: getenv('API_BASE_URL') ?: 'http://localhost:8000/v1';

// Optional backend API key; API_KEY is accepted as an alternate name.
$apiKey = getenv('VLLM_API_KEY') ?: getenv('API_KEY') ?: null;

// Maximum context window size, in tokens.
$maxContextTokens = (int) (getenv('MAX_CONTEXT_TOKENS') ?: '128000');

// Optional Redis URL; in-memory caching is used when not set.
$redisUrl = getenv('REDIS_URL') ?: null;

// Tool-call handling strategy (native, raw, or auto). ToolCallMode::from()
// rejects any other value.
$toolCallMode = ToolCallMode::from(getenv('TOOL_CALL_MODE') ?: 'auto');
// -----------------------------------------------------
// Create Slim App
// -----------------------------------------------------

$app = AppFactory::create();

// Slim middleware is LIFO: middleware added FIRST runs LAST on the way in.
// ModelQuirksMiddleware is added first so it executes after the body-parsing
// middleware below and can see the parsed request body.
$app->add(new ModelQuirksMiddleware());

// Error middleware: displayErrorDetails, logErrors, logErrorDetails all on.
// NOTE(review): detail display is a development setting — confirm it is
// disabled in production deployments.
$app->addErrorMiddleware(true, true, true);

// Body parsing middleware added LAST so it runs FIRST and parses the body
// before any other middleware or route handler sees the request.
$app->addBodyParsingMiddleware();

// -----------------------------------------------------
// Routes
// -----------------------------------------------------
// Health check — liveness probe; static JSON payload, no backend calls.
$app->get('/health', function (ServerRequestInterface $request, ResponseInterface $response) {
    $payload = [
        'status' => 'ok',
        'service' => 'context-paging',
    ];

    $response->getBody()->write(json_encode($payload));

    return $response->withHeader('Content-Type', 'application/json');
});
// Proxy GET /v1/models straight through to the backend.
//
// Fix: backend/transport failures previously escaped to the generic Slim
// error handler; they are now reported as an OpenAI-style JSON error with a
// 502 status, consistent with the error shape used by /v1/chat/completions.
$app->get('/v1/models', function (ServerRequestInterface $request, ResponseInterface $response) use ($vllmUrl, $apiKey) {
    try {
        $client = new OpenAICompatibleClient($vllmUrl, $apiKey);
        $modelsResponse = $client->listModels();

        // Relay the backend's JSON body as-is.
        $response->getBody()->write($modelsResponse->getBody()->getContents());
        return $response->withHeader('Content-Type', 'application/json');
    } catch (\Throwable $e) {
        $response->getBody()->write(json_encode([
            'error' => ['message' => $e->getMessage(), 'type' => 'server_error'],
        ]));
        return $response->withStatus(502)->withHeader('Content-Type', 'application/json');
    }
});
// Main endpoint: POST /v1/chat/completions
//
// Pipeline: validate -> read middleware quirks -> configure context paging ->
// fit messages into the window (Loop 2) -> execute against the backend with
// dereference handling (Loop 3), streaming or not. All failures after
// validation surface as OpenAI-style JSON 500 errors.
$app->post('/v1/chat/completions', function (ServerRequestInterface $request, ResponseInterface $response) use ($vllmUrl, $apiKey, $maxContextTokens, $redisUrl, $toolCallMode) {
    $body = $request->getParsedBody();

    // Validate: "messages" is the only hard requirement.
    if (!isset($body['messages']) || !is_array($body['messages'])) {
        $response->getBody()->write(json_encode([
            'error' => ['message' => 'Missing required field: messages', 'type' => 'validation_error'],
        ]));
        return $response->withStatus(400)->withHeader('Content-Type', 'application/json');
    }

    // Quirks injected by ModelQuirksMiddleware; never forwarded to the backend.
    $quirks = $body['_quirks'] ?? [];
    unset($body['_quirks']);

    $isStreaming = ($body['stream'] ?? false) === true;

    // Setup runs inside the try so cache/enum failures (RedisCache::fromUrl,
    // ToolCallMode::from) also produce this route's JSON error instead of
    // escaping to the generic Slim error handler.
    try {
        // Setup context paging (Redis-backed cache when configured,
        // in-memory otherwise).
        $tokenCounter = new TokenCounter();
        if ($redisUrl) {
            $cache = RedisCache::fromUrl($redisUrl);
            $contextPaging = new ContextPaging($tokenCounter, null, $cache, $cache);
        } else {
            $contextPaging = new ContextPaging($tokenCounter);
        }

        $contextPaging
            ->setMaxContextTokens($maxContextTokens)
            // Cast defensively: JSON clients may send max_tokens as a
            // float or numeric string.
            ->setResponseReserve((int) ($body['max_tokens'] ?? 4096))
            ->setLogFile(__DIR__ . '/output/context-paging.log');

        // Per-model quirk overrides the globally configured tool-call mode.
        $contextPaging->setToolCallMode(
            isset($quirks['tool_mode']) ? ToolCallMode::from($quirks['tool_mode']) : $toolCallMode
        );

        // Models that mishandle tools: use native mode but don't inject
        // tools. The actual stripping happens in the response helpers via
        // $quirks['strip_tools']. (A previous withAttribute('strip_tools')
        // call on the incoming request was dead code and has been removed.)
        if ($quirks['strip_tools'] ?? false) {
            $contextPaging->setToolCallMode(ToolCallMode::NATIVE);
        }

        // Build the PSR-7 request the context-paging engine consumes.
        $contextRequest = new \GuzzleHttp\Psr7\ServerRequest(
            method: 'POST',
            uri: '/chat/completions',
            headers: ['Content-Type' => 'application/json'],
            body: json_encode($body),
            version: '1.1',
            serverParams: $_SERVER
        );
        $contextRequest = $contextRequest->withParsedBody($body);

        // Backend client.
        $client = new OpenAICompatibleClient($vllmUrl, $apiKey);

        // Loop 2: Fit context to window.
        $fittedRequest = $contextPaging->fit($contextRequest);

        // Everything except messages passes through as backend options.
        $options = $body;
        unset($options['messages']);

        // Loop 3: Execute with dereference handling.
        if ($isStreaming) {
            return handleStreamingResponse($response, $contextPaging, $fittedRequest, $client, $options, $quirks);
        }
        return handleNonStreamingResponse($response, $contextPaging, $fittedRequest, $client, $options, $quirks);
    } catch (\Throwable $e) {
        $response->getBody()->write(json_encode([
            'error' => ['message' => $e->getMessage(), 'type' => 'server_error'],
        ]));
        return $response->withStatus(500)->withHeader('Content-Type', 'application/json');
    }
});

// -----------------------------------------------------
// Helpers
// -----------------------------------------------------
/**
 * Run a non-streaming chat completion through the context-paging engine and
 * copy the backend's JSON body onto the Slim response.
 *
 * @param array $options Backend options (the request body minus `messages`).
 * @param array $quirks  Model quirks collected by ModelQuirksMiddleware.
 */
function handleNonStreamingResponse(
    ResponseInterface $response,
    ContextPaging $contextPaging,
    ServerRequestInterface $request,
    OpenAICompatibleClient $client,
    array $options,
    array $quirks
): ResponseInterface {
    // Drop tool definitions for models flagged as mishandling them. Doing
    // this up front is equivalent to unsetting inside the executor closure:
    // $options is captured by value either way.
    if ($quirks['strip_tools'] ?? false) {
        unset($options['tools'], $options['tool_choice']);
    }

    // Executor invoked by the paging engine with the fitted message list.
    $executor = fn (array $messages, $req) => $client->chat($messages, $options);

    $llmResponse = $contextPaging->execute($request, $executor);

    $response->getBody()->write($llmResponse->getBody()->getContents());

    return $response->withHeader('Content-Type', 'application/json');
}
/**
 * Run a streaming chat completion and relay backend chunks as SSE events.
 *
 * Each chunk from the backend stream is written as a `data: {json}` SSE
 * frame, terminated by the OpenAI-style `data: [DONE]` sentinel.
 *
 * @param array $options Backend options (the request body minus `messages`).
 * @param array $quirks  Model quirks collected by ModelQuirksMiddleware.
 */
function handleStreamingResponse(
    ResponseInterface $response,
    ContextPaging $contextPaging,
    ServerRequestInterface $request,
    OpenAICompatibleClient $client,
    array $options,
    array $quirks
): ResponseInterface {
    $shouldStripTools = $quirks['strip_tools'] ?? false;

    // Disable PHP's execution time limit: streams can outlive max_execution_time.
    set_time_limit(0);

    // SSE headers; X-Accel-Buffering: no asks nginx not to buffer the stream.
    $response = $response
        ->withHeader('Content-Type', 'text/event-stream')
        ->withHeader('Cache-Control', 'no-cache')
        ->withHeader('Connection', 'keep-alive')
        ->withHeader('X-Accel-Buffering', 'no');

    $body = $response->getBody();

    // NOTE(review): frames are written into the PSR-7 body buffer and only
    // reach the wire when Slim emits the finished response — this produces
    // streaming-shaped output, not incremental delivery, unless the response
    // emitter flushes progressively. Confirm against the emitter in use.
    $contextPaging->execute($request, function (array $messages, $req) use ($client, $options, $body, $shouldStripTools) {
        // Drop tool definitions for models flagged as mishandling them
        // ($options is the closure's own copy, captured by value).
        if ($shouldStripTools) {
            unset($options['tools'], $options['tool_choice']);
        }

        foreach ($client->chatStream($messages, $options) as $chunk) {
            if (isset($chunk['error'])) {
                // Emit the error chunk once, then stop consuming the stream.
                $body->write("data: " . json_encode($chunk) . "\n\n");
                break;
            }

            $body->write("data: " . json_encode($chunk) . "\n\n");
        }

        // OpenAI-style stream terminator.
        $body->write("data: [DONE]\n\n");

        // Return dummy response for interface — the executor contract
        // expects a PSR-7 response, but the payload was already written
        // to $body above.
        return new \GuzzleHttp\Psr7\Response(200, [], '');
    });

    return $response;
}
// -----------------------------------------------------
// Run
// -----------------------------------------------------

// Dispatch the incoming request through the middleware stack and emit the response.
$app->run();
|