Files
context-paging/index.php
2026-03-28 09:01:07 +00:00

257 lines
8.8 KiB
PHP

<?php
declare(strict_types=1);
require_once __DIR__ . '/vendor/autoload.php';
use ContextPaging\ContextPaging;
use ContextPaging\OpenAICompatibleClient;
use ContextPaging\TokenCounter;
use ContextPaging\RedisCache;
use ContextPaging\ToolCallMode;
use ContextPaging\Middleware\ModelQuirksMiddleware;
use Slim\Factory\AppFactory;
use Slim\Psr7\Response;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\ServerRequestInterface;
/**
* Context Paging — vLLM Facade with Slim Framework
*
* Drop-in HTTP proxy for vLLM with automatic context window management.
* Clients hit this as if it's vLLM; context paging happens transparently.
*
* Environment variables:
* VLLM_URL — Backend vLLM URL (default: http://localhost:8000/v1)
* VLLM_API_KEY — API key for backend (optional)
* MAX_CONTEXT_TOKENS — Maximum context window size (default: 128000)
* REDIS_URL — Redis URL for caching (optional, in-memory if not set)
* TOOL_CALL_MODE — native, raw, or auto (default: auto)
*/
// -----------------------------------------------------
// Load .env
// -----------------------------------------------------
// Minimal .env loader: KEY=VALUE lines, '#' comments, blank lines skipped.
// Real environment variables always win over file values.
$envFile = __DIR__ . '/.env';
if (file_exists($envFile)) {
    foreach (file($envFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES) as $line) {
        // Trim first so indented comment lines (e.g. "  # foo=bar") are
        // recognized as comments instead of being parsed as keys.
        $line = trim($line);
        if (str_starts_with($line, '#') || !str_contains($line, '=')) {
            continue;
        }
        [$key, $value] = explode('=', $line, 2);
        $key = trim($key);
        $value = trim($value);
        // Strip matching surrounding quotes ("value" or 'value') — a common
        // .env convention the previous parser passed through literally.
        if (strlen($value) >= 2
            && ($value[0] === '"' || $value[0] === "'")
            && $value[strlen($value) - 1] === $value[0]
        ) {
            $value = substr($value, 1, -1);
        }
        if (getenv($key) === false) {
            putenv("{$key}={$value}");
            $_ENV[$key] = $value;
        }
    }
}
// -----------------------------------------------------
// Configuration
// -----------------------------------------------------
// All settings come from the environment; see the header docblock for the
// full list. Legacy aliases (API_BASE_URL / API_KEY) are honored as fallbacks.
$vllmUrl = getenv('VLLM_URL') ?: getenv('API_BASE_URL') ?: 'http://localhost:8000/v1';
$apiKey = getenv('VLLM_API_KEY') ?: getenv('API_KEY') ?: null;
$maxContextTokens = (int) (getenv('MAX_CONTEXT_TOKENS') ?: 128000);
$redisUrl = getenv('REDIS_URL') ?: null;
// Fail fast with a readable message instead of a bare ValueError from
// ToolCallMode::from() when TOOL_CALL_MODE is misconfigured.
$toolCallModeRaw = getenv('TOOL_CALL_MODE') ?: 'auto';
$toolCallMode = ToolCallMode::tryFrom($toolCallModeRaw)
    ?? throw new \RuntimeException(
        "Invalid TOOL_CALL_MODE '{$toolCallModeRaw}' (expected: native, raw, or auto)"
    );
// -----------------------------------------------------
// Create Slim App
// -----------------------------------------------------
// Slim middleware is LIFO: the LAST middleware added runs FIRST on an
// incoming request. The registration order below is therefore deliberate —
// do not reorder these add() calls.
$app = AppFactory::create();
// Add model quirks middleware FIRST (runs last, after body parsing)
$app->add(new ModelQuirksMiddleware());
// Add error middleware (display details in dev)
// NOTE(review): all three flags true exposes error details and traces to
// clients — confirm this is disabled in production deployments.
$app->addErrorMiddleware(true, true, true);
// Add body parsing middleware LAST (runs first, parses the body)
$app->addBodyParsingMiddleware();
// -----------------------------------------------------
// Routes
// -----------------------------------------------------
// Health check — liveness probe; reports OK whenever the process is up.
$app->get('/health', static function (ServerRequestInterface $request, ResponseInterface $response): ResponseInterface {
    $payload = json_encode([
        'status' => 'ok',
        'service' => 'context-paging',
    ]);
    $response->getBody()->write($payload);
    return $response->withHeader('Content-Type', 'application/json');
});
// Proxy GET /models straight through to the backend vLLM instance.
$app->get('/v1/models', function (ServerRequestInterface $request, ResponseInterface $response) use ($vllmUrl, $apiKey) {
    $client = new OpenAICompatibleClient($vllmUrl, $apiKey);
    $modelsResponse = $client->listModels();
    $response->getBody()->write($modelsResponse->getBody()->getContents());
    // Forward the backend's status code so errors (401/5xx) are not
    // silently masked as 200.
    return $response
        ->withStatus($modelsResponse->getStatusCode())
        ->withHeader('Content-Type', 'application/json');
});
// Main endpoint: POST /v1/chat/completions
// Validates the payload, fits the conversation into the context window via
// ContextPaging, then delegates to the streaming or non-streaming handler.
$app->post('/v1/chat/completions', function (ServerRequestInterface $request, ResponseInterface $response) use ($vllmUrl, $apiKey, $maxContextTokens, $redisUrl, $toolCallMode) {
    $body = $request->getParsedBody();

    // Validate: messages is the only hard requirement.
    if (!isset($body['messages']) || !is_array($body['messages'])) {
        $response->getBody()->write(json_encode([
            'error' => ['message' => 'Missing required field: messages', 'type' => 'validation_error'],
        ]));
        return $response->withStatus(400)->withHeader('Content-Type', 'application/json');
    }

    // Quirks injected by ModelQuirksMiddleware; stripped before proxying.
    $quirks = $body['_quirks'] ?? [];
    unset($body['_quirks']); // Don't send to backend

    // Setup context paging (Redis-backed cache when configured, in-memory otherwise).
    $tokenCounter = new TokenCounter();
    if ($redisUrl) {
        $cache = RedisCache::fromUrl($redisUrl);
        $contextPaging = new ContextPaging($tokenCounter, null, $cache, $cache);
    } else {
        $contextPaging = new ContextPaging($tokenCounter);
    }

    // Reserve room for the completion. Cast to int (strict_types would
    // otherwise reject a numeric string), and accept the newer OpenAI field
    // name max_completion_tokens as a backward-compatible fallback.
    $responseReserve = (int) ($body['max_tokens'] ?? $body['max_completion_tokens'] ?? 4096);
    $contextPaging
        ->setMaxContextTokens($maxContextTokens)
        ->setResponseReserve($responseReserve)
        ->setLogFile(__DIR__ . '/output/context-paging.log');

    // A per-model quirk overrides the globally configured tool mode.
    if (isset($quirks['tool_mode'])) {
        $contextPaging->setToolCallMode(ToolCallMode::from($quirks['tool_mode']));
    } else {
        $contextPaging->setToolCallMode($toolCallMode);
    }

    // Strip tools if the model doesn't handle them well. The actual removal
    // happens in the response handlers via $quirks['strip_tools'] — the
    // previous withAttribute('strip_tools', ...) call was dead code and is gone.
    if ($quirks['strip_tools'] ?? false) {
        $contextPaging->setToolCallMode(ToolCallMode::NATIVE); // native mode, tools removed downstream
    }

    // Build PSR-7 request for context paging
    $contextRequest = new \GuzzleHttp\Psr7\ServerRequest(
        method: 'POST',
        uri: '/chat/completions',
        headers: ['Content-Type' => 'application/json'],
        body: json_encode($body),
        version: '1.1',
        serverParams: $_SERVER
    );
    $contextRequest = $contextRequest->withParsedBody($body);

    // Backend client
    $client = new OpenAICompatibleClient($vllmUrl, $apiKey);
    $isStreaming = ($body['stream'] ?? false) === true;

    try {
        // Fit the conversation into the model's context window.
        $fittedRequest = $contextPaging->fit($contextRequest);

        // Everything except messages is forwarded as completion options.
        $options = $body;
        unset($options['messages']);

        // Execute with dereference handling.
        if ($isStreaming) {
            return handleStreamingResponse($response, $contextPaging, $fittedRequest, $client, $options, $quirks);
        }
        return handleNonStreamingResponse($response, $contextPaging, $fittedRequest, $client, $options, $quirks);
    } catch (\Throwable $e) {
        $response->getBody()->write(json_encode([
            'error' => ['message' => $e->getMessage(), 'type' => 'server_error'],
        ]));
        return $response->withStatus(500)->withHeader('Content-Type', 'application/json');
    }
});
// -----------------------------------------------------
// Helpers
// -----------------------------------------------------
/**
 * Execute a non-streaming chat completion and copy the LLM reply into the
 * outgoing Slim response as JSON.
 *
 * Tool definitions are dropped from the options up front when the quirks
 * say the model can't use them; this is equivalent to stripping inside the
 * callback since the flag is fixed for the duration of the call.
 */
function handleNonStreamingResponse(
    ResponseInterface $response,
    ContextPaging $contextPaging,
    ServerRequestInterface $request,
    OpenAICompatibleClient $client,
    array $options,
    array $quirks
): ResponseInterface {
    if ($quirks['strip_tools'] ?? false) {
        unset($options['tools'], $options['tool_choice']);
    }

    $llmResponse = $contextPaging->execute(
        $request,
        static fn (array $messages, $req) => $client->chat($messages, $options)
    );

    $response->getBody()->write($llmResponse->getBody()->getContents());
    return $response->withHeader('Content-Type', 'application/json');
}
/**
 * Execute a streaming chat completion, relaying backend SSE chunks to the
 * client as `data: ...` events and terminating with `data: [DONE]`.
 *
 * NOTE(review): chunks are written into the PSR-7 response body, which Slim
 * emits only after this route returns — so output appears to be buffered
 * until the backend stream completes. True incremental delivery would need
 * direct echo + flush; confirm whether buffered delivery is acceptable here.
 */
function handleStreamingResponse(
    ResponseInterface $response,
    ContextPaging $contextPaging,
    ServerRequestInterface $request,
    OpenAICompatibleClient $client,
    array $options,
    array $quirks
): ResponseInterface {
    $shouldStripTools = $quirks['strip_tools'] ?? false;
    // Disable time limit for streaming (the backend may take arbitrarily long).
    set_time_limit(0);
    // SSE headers; X-Accel-Buffering: no disables nginx proxy buffering.
    $response = $response
        ->withHeader('Content-Type', 'text/event-stream')
        ->withHeader('Cache-Control', 'no-cache')
        ->withHeader('Connection', 'keep-alive')
        ->withHeader('X-Accel-Buffering', 'no');
    $body = $response->getBody();
    $contextPaging->execute($request, function (array $messages, $req) use ($client, $options, $body, $shouldStripTools) {
        // If stripping tools, remove them from the forwarded options.
        if ($shouldStripTools) {
            unset($options['tools'], $options['tool_choice']);
        }
        foreach ($client->chatStream($messages, $options) as $chunk) {
            if (isset($chunk['error'])) {
                // Forward the error chunk to the client, then stop streaming.
                $body->write("data: " . json_encode($chunk) . "\n\n");
                break;
            }
            $body->write("data: " . json_encode($chunk) . "\n\n");
        }
        $body->write("data: [DONE]\n\n");
        // ContextPaging::execute expects a response object; the payload has
        // already been written above, so return an empty placeholder.
        return new \GuzzleHttp\Psr7\Response(200, [], '');
    });
    return $response;
}
// -----------------------------------------------------
// Run
// -----------------------------------------------------
// Dispatch the current request through the Slim pipeline and emit the response.
$app->run();