<?php

declare(strict_types=1);

require_once __DIR__ . '/vendor/autoload.php';

use ContextPaging\ContextPaging;
use ContextPaging\OpenAICompatibleClient;
use ContextPaging\TokenCounter;
use ContextPaging\RedisCache;
use ContextPaging\ToolCallMode;
use ContextPaging\Middleware\ModelQuirksMiddleware;
use Slim\Factory\AppFactory;
use Slim\Psr7\Response;
use Psr\Http\Message\ResponseInterface;
use Psr\Http\Message\ServerRequestInterface;
|
|
/**
|
|
* Context Paging — vLLM Facade with Slim Framework
|
|
*
|
|
* Drop-in HTTP proxy for vLLM with automatic context window management.
|
|
* Clients hit this as if it's vLLM; context paging happens transparently.
|
|
*
|
|
* Environment variables:
|
|
* VLLM_URL — Backend vLLM URL (default: http://localhost:8000/v1)
|
|
* VLLM_API_KEY — API key for backend (optional)
|
|
* MAX_CONTEXT_TOKENS — Maximum context window size (default: 128000)
|
|
* REDIS_URL — Redis URL for caching (optional, in-memory if not set)
|
|
* TOOL_CALL_MODE — native, raw, or auto (default: auto)
|
|
*/
|
|
|
|
// -----------------------------------------------------
// Load .env
// -----------------------------------------------------

/**
 * Load KEY=VALUE pairs from a dotenv-style file into the process environment.
 *
 * Parsing rules (original behavior, plus quote stripping):
 *  - lines starting with '#' and lines without '=' are skipped;
 *  - keys and values are trimmed;
 *  - a single pair of matching surrounding quotes ("..." or '...') around a
 *    value is removed — common dotenv convention the inline loader missed;
 *  - the real environment always wins: a key already visible via getenv()
 *    is never overwritten; empty keys are ignored.
 *
 * Missing files are silently skipped (the file is optional).
 */
function loadDotEnv(string $path): void
{
    if (!file_exists($path)) {
        return;
    }

    foreach (file($path, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES) as $line) {
        if (str_starts_with($line, '#') || !str_contains($line, '=')) {
            continue;
        }

        [$key, $value] = explode('=', $line, 2);
        $key = trim($key);
        $value = trim($value);

        // Strip one pair of matching surrounding quotes, if present.
        if (strlen($value) >= 2
            && ($value[0] === '"' || $value[0] === "'")
            && $value[strlen($value) - 1] === $value[0]
        ) {
            $value = substr($value, 1, -1);
        }

        if ($key !== '' && getenv($key) === false) {
            putenv("{$key}={$value}");
            $_ENV[$key] = $value;
        }
    }
}

loadDotEnv(__DIR__ . '/.env');
// -----------------------------------------------------
// Configuration
// -----------------------------------------------------

// getenv() returns false (not null) when a variable is unset, so `?:` is
// the correct fallback operator throughout this section — `??` would not
// trigger on the false return value.

// Backend vLLM endpoint; API_BASE_URL is accepted as an alternate name.
$vllmUrl = getenv('VLLM_URL') ?: getenv('API_BASE_URL') ?: 'http://localhost:8000/v1';

// Optional backend API key; API_KEY is accepted as an alternate name.
$apiKey = getenv('VLLM_API_KEY') ?: getenv('API_KEY') ?: null;

// Maximum context window size, in tokens.
$maxContextTokens = (int) (getenv('MAX_CONTEXT_TOKENS') ?: '128000');

// Optional Redis URL; in-memory caching is used when not set.
$redisUrl = getenv('REDIS_URL') ?: null;

// Tool-call handling strategy (native, raw, or auto). ToolCallMode::from()
// rejects any other value.
$toolCallMode = ToolCallMode::from(getenv('TOOL_CALL_MODE') ?: 'auto');
// -----------------------------------------------------
// Create Slim App
// -----------------------------------------------------

$app = AppFactory::create();

// Slim middleware is LIFO: middleware added FIRST runs LAST on the way in.
// ModelQuirksMiddleware is added first so it executes after the body-parsing
// middleware below and can see the parsed request body.
$app->add(new ModelQuirksMiddleware());

// Error middleware: displayErrorDetails, logErrors, logErrorDetails all on.
// NOTE(review): detail display is a development setting — confirm it is
// disabled in production deployments.
$app->addErrorMiddleware(true, true, true);

// Body parsing middleware added LAST so it runs FIRST and parses the body
// before any other middleware or route handler sees the request.
$app->addBodyParsingMiddleware();

// -----------------------------------------------------
// Routes
// -----------------------------------------------------
// Health check — liveness probe; static JSON payload, no backend calls.
$app->get('/health', function (ServerRequestInterface $request, ResponseInterface $response) {
    $payload = [
        'status' => 'ok',
        'service' => 'context-paging',
    ];

    $response->getBody()->write(json_encode($payload));

    return $response->withHeader('Content-Type', 'application/json');
});
// Proxy GET /v1/models straight through to the backend.
//
// Fix: backend/transport failures previously escaped to the generic Slim
// error handler; they are now reported as an OpenAI-style JSON error with a
// 502 status, consistent with the error shape used by /v1/chat/completions.
$app->get('/v1/models', function (ServerRequestInterface $request, ResponseInterface $response) use ($vllmUrl, $apiKey) {
    try {
        $client = new OpenAICompatibleClient($vllmUrl, $apiKey);
        $modelsResponse = $client->listModels();

        // Relay the backend's JSON body as-is.
        $response->getBody()->write($modelsResponse->getBody()->getContents());
        return $response->withHeader('Content-Type', 'application/json');
    } catch (\Throwable $e) {
        $response->getBody()->write(json_encode([
            'error' => ['message' => $e->getMessage(), 'type' => 'server_error'],
        ]));
        return $response->withStatus(502)->withHeader('Content-Type', 'application/json');
    }
});
// Main endpoint: POST /v1/chat/completions
//
// Pipeline: validate -> read middleware quirks -> configure context paging ->
// fit messages into the window (Loop 2) -> execute against the backend with
// dereference handling (Loop 3), streaming or not. All failures after
// validation surface as OpenAI-style JSON 500 errors.
$app->post('/v1/chat/completions', function (ServerRequestInterface $request, ResponseInterface $response) use ($vllmUrl, $apiKey, $maxContextTokens, $redisUrl, $toolCallMode) {
    $body = $request->getParsedBody();

    // Validate: "messages" is the only hard requirement.
    if (!isset($body['messages']) || !is_array($body['messages'])) {
        $response->getBody()->write(json_encode([
            'error' => ['message' => 'Missing required field: messages', 'type' => 'validation_error'],
        ]));
        return $response->withStatus(400)->withHeader('Content-Type', 'application/json');
    }

    // Quirks injected by ModelQuirksMiddleware; never forwarded to the backend.
    $quirks = $body['_quirks'] ?? [];
    unset($body['_quirks']);

    $isStreaming = ($body['stream'] ?? false) === true;

    // Setup runs inside the try so cache/enum failures (RedisCache::fromUrl,
    // ToolCallMode::from) also produce this route's JSON error instead of
    // escaping to the generic Slim error handler.
    try {
        // Setup context paging (Redis-backed cache when configured,
        // in-memory otherwise).
        $tokenCounter = new TokenCounter();
        if ($redisUrl) {
            $cache = RedisCache::fromUrl($redisUrl);
            $contextPaging = new ContextPaging($tokenCounter, null, $cache, $cache);
        } else {
            $contextPaging = new ContextPaging($tokenCounter);
        }

        $contextPaging
            ->setMaxContextTokens($maxContextTokens)
            // Cast defensively: JSON clients may send max_tokens as a
            // float or numeric string.
            ->setResponseReserve((int) ($body['max_tokens'] ?? 4096))
            ->setLogFile(__DIR__ . '/output/context-paging.log');

        // Per-model quirk overrides the globally configured tool-call mode.
        $contextPaging->setToolCallMode(
            isset($quirks['tool_mode']) ? ToolCallMode::from($quirks['tool_mode']) : $toolCallMode
        );

        // Models that mishandle tools: use native mode but don't inject
        // tools. The actual stripping happens in the response helpers via
        // $quirks['strip_tools']. (A previous withAttribute('strip_tools')
        // call on the incoming request was dead code and has been removed.)
        if ($quirks['strip_tools'] ?? false) {
            $contextPaging->setToolCallMode(ToolCallMode::NATIVE);
        }

        // Build the PSR-7 request the context-paging engine consumes.
        $contextRequest = new \GuzzleHttp\Psr7\ServerRequest(
            method: 'POST',
            uri: '/chat/completions',
            headers: ['Content-Type' => 'application/json'],
            body: json_encode($body),
            version: '1.1',
            serverParams: $_SERVER
        );
        $contextRequest = $contextRequest->withParsedBody($body);

        // Backend client.
        $client = new OpenAICompatibleClient($vllmUrl, $apiKey);

        // Loop 2: Fit context to window.
        $fittedRequest = $contextPaging->fit($contextRequest);

        // Everything except messages passes through as backend options.
        $options = $body;
        unset($options['messages']);

        // Loop 3: Execute with dereference handling.
        if ($isStreaming) {
            return handleStreamingResponse($response, $contextPaging, $fittedRequest, $client, $options, $quirks);
        }
        return handleNonStreamingResponse($response, $contextPaging, $fittedRequest, $client, $options, $quirks);
    } catch (\Throwable $e) {
        $response->getBody()->write(json_encode([
            'error' => ['message' => $e->getMessage(), 'type' => 'server_error'],
        ]));
        return $response->withStatus(500)->withHeader('Content-Type', 'application/json');
    }
});

// -----------------------------------------------------
// Helpers
// -----------------------------------------------------
/**
 * Run a non-streaming chat completion through the context-paging engine and
 * copy the backend's JSON body onto the Slim response.
 *
 * @param array $options Backend options (the request body minus `messages`).
 * @param array $quirks  Model quirks collected by ModelQuirksMiddleware.
 */
function handleNonStreamingResponse(
    ResponseInterface $response,
    ContextPaging $contextPaging,
    ServerRequestInterface $request,
    OpenAICompatibleClient $client,
    array $options,
    array $quirks
): ResponseInterface {
    // Drop tool definitions for models flagged as mishandling them. Doing
    // this up front is equivalent to unsetting inside the executor closure:
    // $options is captured by value either way.
    if ($quirks['strip_tools'] ?? false) {
        unset($options['tools'], $options['tool_choice']);
    }

    // Executor invoked by the paging engine with the fitted message list.
    $executor = fn (array $messages, $req) => $client->chat($messages, $options);

    $llmResponse = $contextPaging->execute($request, $executor);

    $response->getBody()->write($llmResponse->getBody()->getContents());

    return $response->withHeader('Content-Type', 'application/json');
}
/**
 * Run a streaming chat completion and relay backend chunks as SSE events.
 *
 * Each chunk from the backend stream is written as a `data: {json}` SSE
 * frame, terminated by the OpenAI-style `data: [DONE]` sentinel.
 *
 * @param array $options Backend options (the request body minus `messages`).
 * @param array $quirks  Model quirks collected by ModelQuirksMiddleware.
 */
function handleStreamingResponse(
    ResponseInterface $response,
    ContextPaging $contextPaging,
    ServerRequestInterface $request,
    OpenAICompatibleClient $client,
    array $options,
    array $quirks
): ResponseInterface {
    $shouldStripTools = $quirks['strip_tools'] ?? false;

    // Disable PHP's execution time limit: streams can outlive max_execution_time.
    set_time_limit(0);

    // SSE headers; X-Accel-Buffering: no asks nginx not to buffer the stream.
    $response = $response
        ->withHeader('Content-Type', 'text/event-stream')
        ->withHeader('Cache-Control', 'no-cache')
        ->withHeader('Connection', 'keep-alive')
        ->withHeader('X-Accel-Buffering', 'no');

    $body = $response->getBody();

    // NOTE(review): frames are written into the PSR-7 body buffer and only
    // reach the wire when Slim emits the finished response — this produces
    // streaming-shaped output, not incremental delivery, unless the response
    // emitter flushes progressively. Confirm against the emitter in use.
    $contextPaging->execute($request, function (array $messages, $req) use ($client, $options, $body, $shouldStripTools) {
        // Drop tool definitions for models flagged as mishandling them
        // ($options is the closure's own copy, captured by value).
        if ($shouldStripTools) {
            unset($options['tools'], $options['tool_choice']);
        }

        foreach ($client->chatStream($messages, $options) as $chunk) {
            if (isset($chunk['error'])) {
                // Emit the error chunk once, then stop consuming the stream.
                $body->write("data: " . json_encode($chunk) . "\n\n");
                break;
            }

            $body->write("data: " . json_encode($chunk) . "\n\n");
        }

        // OpenAI-style stream terminator.
        $body->write("data: [DONE]\n\n");

        // Return dummy response for interface — the executor contract
        // expects a PSR-7 response, but the payload was already written
        // to $body above.
        return new \GuzzleHttp\Psr7\Response(200, [], '');
    });

    return $response;
}
// -----------------------------------------------------
// Run
// -----------------------------------------------------

// Dispatch the incoming request through the middleware stack and emit the response.
$app->run();
|