add(new ModelQuirksMiddleware()); // model-quirks middleware (receiver "$app->" presumably ends the preceding chunk — confirm)

// Add error middleware (display details in dev)
$app->addErrorMiddleware(true, true, true);

// Add body parsing middleware LAST (runs first, parses the body)
$app->addBodyParsingMiddleware();

// -----------------------------------------------------
// Routes
// -----------------------------------------------------

// Health check
$app->get('/health', function (ServerRequestInterface $request, ResponseInterface $response) {
    $response->getBody()->write(json_encode([
        'status' => 'ok',
        'service' => 'context-paging',
    ]));
    return $response->withHeader('Content-Type', 'application/json');
});

// Proxy GET /models: forward the backend's model list verbatim.
$app->get('/v1/models', function (ServerRequestInterface $request, ResponseInterface $response) use ($vllmUrl, $apiKey) {
    $client = new OpenAICompatibleClient($vllmUrl, $apiKey);
    $modelsResponse = $client->listModels();
    $response->getBody()->write($modelsResponse->getBody()->getContents());
    return $response->withHeader('Content-Type', 'application/json');
});

// Main endpoint: POST /v1/chat/completions
$app->post('/v1/chat/completions', function (ServerRequestInterface $request, ResponseInterface $response) use ($vllmUrl, $apiKey, $maxContextTokens, $redisUrl, $toolCallMode) {
    $body = $request->getParsedBody();

    // Validate: a 'messages' array is mandatory.
    if (!isset($body['messages']) || !is_array($body['messages'])) {
        $response->getBody()->write(json_encode([
            'error' => ['message' => 'Missing required field: messages', 'type' => 'validation_error'],
        ]));
        return $response->withStatus(400)->withHeader('Content-Type', 'application/json');
    }

    // Get quirks applied by middleware; strip the marker so it is never
    // forwarded to the backend.
    $quirks = $body['_quirks'] ?? [];
    unset($body['_quirks']); // Don't send to backend

    // Setup context paging (Redis-backed cache when a URL is configured).
    $tokenCounter = new TokenCounter();
    if ($redisUrl) {
        $cache = RedisCache::fromUrl($redisUrl);
        $contextPaging = new ContextPaging($tokenCounter, null, $cache, $cache);
    } else {
        $contextPaging = new ContextPaging($tokenCounter);
    }
    $contextPaging
        ->setMaxContextTokens($maxContextTokens)
        ->setResponseReserve($body['max_tokens'] ?? 4096)
        ->setLogFile(__DIR__ . '/output/context-paging.log');

    // Override tool mode from quirks if specified.
    // NOTE(review): ToolCallMode::from() throws ValueError on an unknown
    // quirk value, which surfaces as a 500 via the catch below — confirm
    // that is the intended failure mode.
    if (isset($quirks['tool_mode'])) {
        $contextPaging->setToolCallMode(ToolCallMode::from($quirks['tool_mode']));
    } else {
        $contextPaging->setToolCallMode($toolCallMode);
    }

    // Strip tools if the model doesn't handle them well. The actual removal
    // of tools/tool_choice happens in the handlers below via $quirks — the
    // original also tagged $request with a 'strip_tools' attribute, but that
    // variable was never read again, so the dead assignment is removed here.
    if ($quirks['strip_tools'] ?? false) {
        $contextPaging->setToolCallMode(ToolCallMode::NATIVE); // Use native but don't inject tools
    }

    // Build PSR-7 request for context paging
    $contextRequest = new \GuzzleHttp\Psr7\ServerRequest(
        method: 'POST',
        uri: '/chat/completions',
        headers: ['Content-Type' => 'application/json'],
        body: json_encode($body),
        version: '1.1',
        serverParams: $_SERVER
    );
    $contextRequest = $contextRequest->withParsedBody($body);

    // Backend client
    $client = new OpenAICompatibleClient($vllmUrl, $apiKey);

    $isStreaming = ($body['stream'] ?? false) === true;

    try {
        // Loop 2: Fit context to window
        $fittedRequest = $contextPaging->fit($contextRequest);

        // Extract options: everything in the body except the message list.
        $options = $body;
        unset($options['messages']);

        // Loop 3: Execute with dereference handling
        if ($isStreaming) {
            return handleStreamingResponse($response, $contextPaging, $fittedRequest, $client, $options, $quirks);
        }
        return handleNonStreamingResponse($response, $contextPaging, $fittedRequest, $client, $options, $quirks);
    } catch (\Throwable $e) {
        $response->getBody()->write(json_encode([
            'error' => ['message' => $e->getMessage(), 'type' => 'server_error'],
        ]));
        return $response->withStatus(500)->withHeader('Content-Type', 'application/json');
    }
});

// -----------------------------------------------------
// Helpers
// -----------------------------------------------------

/**
 * Execute a non-streaming chat completion via context paging and copy the
 * backend response body into the PSR-7 response as JSON.
 *
 * @param array $options Request options forwarded to the backend ('messages' removed by caller).
 * @param array $quirks  Per-model quirks; 'strip_tools' drops tools/tool_choice before the call.
 */
function handleNonStreamingResponse(
    ResponseInterface $response,
    ContextPaging $contextPaging,
    ServerRequestInterface $request,
    OpenAICompatibleClient $client,
    array $options,
    array $quirks
): ResponseInterface {
    $shouldStripTools = $quirks['strip_tools'] ?? false;

    $llmResponse = $contextPaging->execute($request, function (array $messages, $req) use ($client, $options, $shouldStripTools) {
        // If stripping tools, remove them from options before hitting the backend.
        if ($shouldStripTools) {
            unset($options['tools'], $options['tool_choice']);
        }
        return $client->chat($messages, $options);
    });

    $response->getBody()->write($llmResponse->getBody()->getContents());
    return $response->withHeader('Content-Type', 'application/json');
}

/**
 * Execute a streaming chat completion, relaying backend chunks as
 * Server-Sent Events ("data: {...}\n\n") terminated by "data: [DONE]".
 *
 * @param array $options Request options forwarded to the backend ('messages' removed by caller).
 * @param array $quirks  Per-model quirks; 'strip_tools' drops tools/tool_choice before the call.
 */
function handleStreamingResponse(
    ResponseInterface $response,
    ContextPaging $contextPaging,
    ServerRequestInterface $request,
    OpenAICompatibleClient $client,
    array $options,
    array $quirks
): ResponseInterface {
    $shouldStripTools = $quirks['strip_tools'] ?? false;

    // Disable time limit for streaming
    set_time_limit(0);

    // SSE headers; X-Accel-Buffering disables nginx proxy buffering.
    // NOTE(review): chunks are accumulated in the PSR-7 body, which Slim
    // only emits after this route returns — confirm a streaming-capable
    // emitter (or echo+flush) is in place if true incremental delivery
    // to the client is required.
    $response = $response
        ->withHeader('Content-Type', 'text/event-stream')
        ->withHeader('Cache-Control', 'no-cache')
        ->withHeader('Connection', 'keep-alive')
        ->withHeader('X-Accel-Buffering', 'no');

    $body = $response->getBody();

    $contextPaging->execute($request, function (array $messages, $req) use ($client, $options, $body, $shouldStripTools) {
        if ($shouldStripTools) {
            unset($options['tools'], $options['tool_choice']);
        }

        foreach ($client->chatStream($messages, $options) as $chunk) {
            // Every chunk — including an error payload — is forwarded as an
            // SSE data event; an error additionally terminates the stream.
            // (Collapses the original's duplicated write in the error branch.)
            $body->write("data: " . json_encode($chunk) . "\n\n");
            if (isset($chunk['error'])) {
                break;
            }
        }
        $body->write("data: [DONE]\n\n");

        // Return dummy response for interface
        return new \GuzzleHttp\Psr7\Response(200, [], '');
    });

    return $response;
}

// -----------------------------------------------------
// Run
// -----------------------------------------------------
$app->run();