615 lines
19 KiB
PHP
615 lines
19 KiB
PHP
<?php
|
|
|
|
declare(strict_types=1);
|
|
|
|
namespace ContextPaging;
|
|
|
|
use Psr\Http\Message\ServerRequestInterface;
|
|
use Psr\Http\Message\ResponseInterface;
|
|
|
|
/**
|
|
* Context Paging — Virtual memory for LLM context windows.
|
|
*
|
|
* Loop 2: fit() — compress messages until they fit the context window.
|
|
* Loop 3: execute() — run LLM, handle dereference tool calls.
|
|
*/
|
|
class ContextPaging
|
|
{
|
|
/**
 * Size of the model's context window, in tokens.
 */
private int $maxContextTokens = 128000;

/**
 * Tokens held back for the model's response.
 */
private int $responseReserve = 4096;

/**
 * Headroom subtracted from the budget to absorb counting discrepancies.
 *
 * Tokenizers (tiktoken vs vLLM) may count slightly differently and message
 * formatting adds overhead; this buffer avoids the edge case where the
 * context appears to fit locally but the API rejects it.
 */
private int $safetyMargin = 500;

/**
 * Path of the file internal events are appended to; null disables logging.
 */
private ?string $logFile = null;

/**
 * Short identifier attached to every log entry for correlation.
 */
private string $requestId;

/**
 * Backing store for original messages (the "disk" behind the virtual memory).
 * Entries are keyed by MD5 hash and hold the full message array.
 */
private CacheInterface $messageStore;

/**
 * Cache of generated summaries, keyed by the MD5 of the original content.
 */
private CacheInterface $summaryCache;

/**
 * Collaborator used for all token counting.
 */
private TokenCounter $tokenCounter;

/**
 * Optional summarizer; when null a built-in truncation fallback is used.
 */
private ?SummarizerInterface $summarizer = null;

/**
 * Configured tool call mode (NATIVE, RAW, or AUTO).
 */
private ToolCallMode $toolCallMode = ToolCallMode::AUTO;

/**
 * Extracts tool calls from LLM responses.
 */
private ToolCallParser $toolCallParser;

/**
 * Builds the request payload, including tool definitions.
 */
private ToolFormatter $toolFormatter;
|
|
|
|
/**
 * Wire up collaborators, substituting built-in defaults for any that are omitted.
 *
 * @param TokenCounter|null        $tokenCounter Token counter (default: a new TokenCounter)
 * @param SummarizerInterface|null $summarizer   Summarizer; may remain null
 * @param CacheInterface|null      $messageStore Cache for original messages (default: in-memory)
 * @param CacheInterface|null      $summaryCache Cache for summaries (default: in-memory)
 */
public function __construct(
    ?TokenCounter $tokenCounter = null,
    ?SummarizerInterface $summarizer = null,
    ?CacheInterface $messageStore = null,
    ?CacheInterface $summaryCache = null
) {
    $this->summarizer = $summarizer;
    $this->tokenCounter = $tokenCounter ?? new TokenCounter();
    $this->messageStore = $messageStore ?? new InMemoryCache();
    $this->summaryCache = $summaryCache ?? new InMemoryCache();

    // Parser and formatter both start in the mode the property defaults to.
    $initialMode = $this->toolCallMode;
    $this->toolCallParser = new ToolCallParser($initialMode);
    $this->toolFormatter = new ToolFormatter($initialMode);

    // First eight hex characters of a hashed unique id; used to correlate log lines.
    $uniqueSeed = uniqid('', true);
    $this->requestId = substr(md5($uniqueSeed), 0, 8);
}
|
|
|
|
/**
 * Enable event logging by choosing the file entries are appended to.
 *
 * @param string $path Destination file path
 *
 * @return self This instance, for chaining
 */
public function setLogFile(string $path): self
{
    $this->logFile = $path;

    return $this;
}
|
|
|
|
/**
 * Append one JSON-encoded event line to the log file.
 *
 * Does nothing when no log file is configured. Each entry carries a
 * timestamp, the request id and the event name, merged with $data
 * (keys in $data override the defaults on collision).
 *
 * @param string               $event Event name
 * @param array<string, mixed> $data  Extra fields for the entry
 */
private function log(string $event, array $data = []): void
{
    if ($this->logFile === null) {
        return;
    }

    $record = array_merge(
        ['timestamp' => date('Y-m-d H:i:s'), 'request_id' => $this->requestId, 'event' => $event],
        $data
    );

    // Logged data may contain byte-truncated text; substitute invalid UTF-8
    // instead of letting json_encode() fail and the entry collapse to "\n".
    $encoded = json_encode($record, JSON_INVALID_UTF8_SUBSTITUTE | JSON_PARTIAL_OUTPUT_ON_ERROR);
    if ($encoded === false) {
        return;
    }

    file_put_contents($this->logFile, $encoded . "\n", FILE_APPEND | LOCK_EX);
}
|
|
|
|
/**
 * Replace the summarizer used when compressing messages.
 *
 * @param SummarizerInterface $summarizer Summarizer to use from now on
 *
 * @return self This instance, for chaining
 */
public function setSummarizer(SummarizerInterface $summarizer): self
{
    $this->summarizer = $summarizer;

    return $this;
}
|
|
|
|
/**
 * Replace the cache that holds original messages.
 *
 * @param CacheInterface $cache Cache to store originals in
 *
 * @return self This instance, for chaining
 */
public function setMessageStore(CacheInterface $cache): self
{
    $this->messageStore = $cache;

    return $this;
}
|
|
|
|
/**
 * Replace the cache that holds generated summaries.
 *
 * @param CacheInterface $cache Cache to store summaries in
 *
 * @return self This instance, for chaining
 */
public function setSummaryCache(CacheInterface $cache): self
{
    $this->summaryCache = $cache;

    return $this;
}
|
|
|
|
/**
 * Change the tool call mode and propagate it to the parser and formatter.
 *
 * @param ToolCallMode $mode Mode to apply
 *
 * @return self This instance, for chaining
 */
public function setToolCallMode(ToolCallMode $mode): self
{
    $this->toolCallMode = $mode;

    // Both collaborators track the mode themselves, so update them as well.
    $this->toolCallParser->setMode($mode);
    $this->toolFormatter->setMode($mode);

    return $this;
}
|
|
|
|
/**
 * Report the tool call mode currently configured on this instance.
 *
 * @return ToolCallMode The configured mode
 */
public function getToolCallMode(): ToolCallMode
{
    return $this->toolCallMode;
}
|
|
|
|
/**
 * LOOP 2 — Fit the context to the window.
 *
 * Stores the original messages for later dereferencing, computes the token
 * budget (context window minus response allowance minus safety margin) and,
 * when the messages exceed it, summarizes them until they fit.
 *
 * The returned request carries the attributes `context_fitted`,
 * `context_tokens` and `context_budget`; when compression took place it also
 * carries `original_token_count` and a parsed body with the fitted messages.
 * A request without a usable message list is returned unchanged.
 *
 * @param ServerRequestInterface $request Request whose parsed body holds `messages`
 *
 * @return ServerRequestInterface The (possibly rewritten) request
 *
 * @throws \RuntimeException When the context cannot be brought within budget
 */
public function fit(ServerRequestInterface $request): ServerRequestInterface
{
    $body = $request->getParsedBody();

    // getParsedBody() may yield null or an object; only an array body can be indexed below.
    if (!is_array($body)) {
        return $request;
    }

    $messages = $body['messages'] ?? [];
    if (!is_array($messages) || $messages === []) {
        return $request;
    }

    // Store originals for dereferencing
    $this->storeOriginals($messages);

    // Use max_tokens from the request when it is numeric, otherwise fall back
    // to responseReserve so the budget arithmetic always works on integers.
    $requestedMax = $body['max_tokens'] ?? null;
    $maxTokens = is_numeric($requestedMax) ? (int) $requestedMax : $this->responseReserve;

    // Safety margin accounts for tokenizer discrepancies and message overhead
    $tokens = $this->countTokens($messages);
    $budget = $this->maxContextTokens - $maxTokens - $this->safetyMargin;

    $this->log('fit_start', [
        'message_count' => count($messages),
        'original_tokens' => $tokens,
        'budget' => $budget,
        'max_context' => $this->maxContextTokens,
        'response_reserve' => $maxTokens,
        'needs_compression' => $tokens > $budget,
    ]);

    // Already fits? Done.
    if ($tokens <= $budget) {
        $this->log('fit_skip', ['reason' => 'already_within_budget']);

        return $request->withAttribute('context_fitted', true)
            ->withAttribute('context_tokens', $tokens)
            ->withAttribute('context_budget', $budget);
    }

    // Summarize oldest messages until we fit
    $messages = $this->summarizeToFit($messages, $budget, $tokens);

    // Rebuild the request with fitted messages
    $body['messages'] = $messages;

    $newTokens = $this->countTokens($messages);
    $savedTokens = $tokens - $newTokens;

    // Guard the percentage against a zero original count.
    $ratio = $tokens > 0 ? round($savedTokens / $tokens * 100, 1) : 0.0;

    $this->log('fit_complete', [
        'original_tokens' => $tokens,
        'fitted_tokens' => $newTokens,
        'saved_tokens' => $savedTokens,
        'compression_ratio' => $ratio . '%',
    ]);

    return $request->withParsedBody($body)
        ->withAttribute('context_fitted', true)
        ->withAttribute('context_tokens', $newTokens)
        ->withAttribute('context_budget', $budget)
        ->withAttribute('original_token_count', $tokens);
}
|
|
|
|
/**
 * LOOP 3 — Execute with dereference handling.
 *
 * Builds a payload that includes the fetch_message tool, calls $llmInvoker
 * and inspects the response. When the model asks for `fetch_message`, the
 * original message is looked up by MD5, swapped in for its summary, a tool
 * result message is appended and the model is called again (at most 10
 * rounds). Any other outcome returns the latest response as-is.
 *
 * @param ServerRequestInterface $request    Request whose parsed body holds `messages` and options
 * @param callable               $llmInvoker Called as $llmInvoker(array $messages, array $payload);
 *                                           its result is used as a PSR-7 response
 *
 * @return ResponseInterface The last LLM response, rebuilt around the body that was read
 */
public function execute(ServerRequestInterface $request, callable $llmInvoker): ResponseInterface
{
    $body = $request->getParsedBody();
    $messages = is_array($body) && is_array($body['messages'] ?? null) ? $body['messages'] : [];
    $options = $this->extractOptions($request);

    // Add the fetch_message tool to the request
    $payload = $this->toolFormatter->buildPayload(
        $messages,
        $options,
        [ToolFormatter::FETCH_MESSAGE_TOOL],
        $this->toolCallMode
    );

    $iteration = 0;
    $maxIterations = 10;
    $response = null;

    $this->log('execute_start', [
        'message_count' => count($messages),
        'tool_mode' => $this->toolCallMode->value,
    ]);

    while ($iteration < $maxIterations) {
        // Memory dump: log context state before each LLM call
        $this->logMemoryDump($payload['messages'], $iteration);

        $response = $llmInvoker($payload['messages'], $payload);

        // The string cast rewinds a seekable stream first, so the full body is
        // read even if something upstream already consumed it.
        $responseBody = (string) $response->getBody();
        $responseData = json_decode($responseBody, true);

        // Rebuild the response so the caller receives an unread body.
        $response = new \GuzzleHttp\Psr7\Response(
            $response->getStatusCode(),
            $response->getHeaders(),
            $responseBody
        );

        if ($iteration === 0 && $this->toolCallMode === ToolCallMode::AUTO) {
            $detectedMode = $this->toolCallParser->detectMode($responseData ?? []);
            $this->toolCallParser->setMode($detectedMode);
            $this->toolFormatter->setMode($detectedMode);
            $this->log('tool_mode_detected', ['mode' => $detectedMode->value]);
        }

        $toolCalls = $this->toolCallParser->extract($responseData ?? []);

        if ($toolCalls === null) {
            $this->log('execute_complete', [
                'iterations' => $iteration,
                'had_dereferences' => $iteration > 0,
            ]);

            return $response;
        }

        // Only the first fetch_message call of a response is handled.
        $fetchCall = null;
        foreach ($toolCalls as $call) {
            if (($call['name'] ?? null) === 'fetch_message') {
                $fetchCall = $call;
                break;
            }
        }

        if ($fetchCall === null) {
            $this->log('execute_complete', [
                'iterations' => $iteration,
                'had_dereferences' => $iteration > 0,
                'other_tool_calls' => count($toolCalls),
            ]);

            return $response;
        }

        $md5 = $fetchCall['arguments']['md5'] ?? null;

        // The argument comes from model output; anything but a string cannot be
        // dereferenced and would otherwise raise a TypeError.
        if (!is_string($md5)) {
            $this->log('dereference_error', [
                'reason' => $md5 === null ? 'missing_md5' : 'invalid_md5',
            ]);

            return $response;
        }

        $this->log('dereference_start', [
            'md5' => $md5,
            'iteration' => $iteration + 1,
        ]);

        $fullMessage = $this->dereference($md5);

        if ($fullMessage === null) {
            $this->log('dereference_error', [
                'md5' => $md5,
                'reason' => 'message_not_found',
            ]);

            return $response;
        }

        $fullContent = $fullMessage['content'] ?? '';
        $fullTokens = $this->tokenCounter->count($fullContent);

        $payload['messages'] = $this->injectDereferenced($payload['messages'], $md5, $fullMessage);

        // NOTE(review): only the tool result is appended; the assistant turn that
        // issued the call is not added to the history — confirm the backend accepts this.
        $payload['messages'][] = [
            'role' => 'tool',
            'content' => json_encode([
                'status' => 'success',
                'message' => 'Full message retrieved and injected into context.',
            ]),
            // Not every parsed call is guaranteed to carry an id.
            'tool_call_id' => $fetchCall['id'] ?? null,
        ];

        $this->log('dereference_success', [
            'md5' => $md5,
            'role' => $fullMessage['role'] ?? 'unknown',
            'content_chars' => is_string($fullContent) ? strlen($fullContent) : 0,
            'content_tokens' => $fullTokens,
            'new_message_count' => count($payload['messages']),
        ]);

        $iteration++;
    }

    $this->log('execute_error', ['reason' => 'max_iterations_reached', 'iterations' => $iteration]);

    // After at least one round $response is set, so the 500 fallback only
    // applies if the loop never ran.
    return $response ?? new \GuzzleHttp\Psr7\Response(
        500,
        ['Content-Type' => 'application/json'],
        json_encode(['error' => ['message' => 'Max dereference iterations reached']])
    );
}
|
|
|
|
// -----------------------------------------------------------------
|
|
// PRIVATE: Loop 2 helpers
|
|
// -----------------------------------------------------------------
|
|
|
|
/**
 * Return the request's parsed body without its `messages` entry.
 *
 * A body that is not an array (null or an object) yields an empty option
 * set instead of violating the array return type.
 *
 * @param ServerRequestInterface $request Incoming request
 *
 * @return array<string, mixed> Every body field except `messages`
 */
private function extractOptions(ServerRequestInterface $request): array
{
    $body = $request->getParsedBody();

    if (!is_array($body)) {
        return [];
    }

    unset($body['messages']);

    return $body;
}
|
|
|
|
/**
 * Store original messages keyed by the MD5 hash of their content.
 *
 * String content is hashed directly; any other content is hashed via its
 * JSON encoding, which is the same scheme summarizeMessage() uses to build
 * the md5 reference. Without this, non-string messages could be summarized
 * but never found again on dereference.
 *
 * @param array<int, mixed> $messages Messages to persist
 */
private function storeOriginals(array $messages): void
{
    foreach ($messages as $message) {
        // Only array-shaped messages can be handed back by dereference().
        if (!is_array($message)) {
            continue;
        }

        $content = $message['content'] ?? '';
        $hashInput = is_string($content) ? $content : json_encode($content);

        // Content that cannot be encoded has no stable hash; skip it.
        if ($hashInput === false) {
            continue;
        }

        $md5 = md5($hashInput);
        $this->messageStore->set("msg:{$md5}", $message);
    }
}
|
|
|
|
/**
 * Summarize messages, oldest first, until the context fits the budget.
 *
 * The last message is never summarized. The scan position and the running
 * token count are carried between iterations, so each message is examined
 * once and the context is re-counted once per summarization.
 *
 * @param array<int, array> $messages       Messages to compress (expects a 0-based list)
 * @param int               $budget         Maximum allowed token count
 * @param int               $originalTokens Token count before compression (used for logging only)
 *
 * @return array<int, array> The messages with summaries substituted
 *
 * @throws \RuntimeException When every message but the last is summarized and
 *                           the context is still over budget
 */
private function summarizeToFit(array $messages, int $budget, int $originalTokens): array
{
    $lastIndex = count($messages) - 1;
    $summarizedCount = 0;
    $cursor = 0;
    $currentTokens = $this->countTokens($messages);

    while ($currentTokens > $budget) {
        // Everything before $cursor is already summarized; resume from there
        // instead of rescanning from the start.
        while ($cursor < $lastIndex && $this->isSummarized($messages[$cursor])) {
            $cursor++;
        }

        if ($cursor >= $lastIndex) {
            $this->log('fit_error', [
                'reason' => 'all_messages_summarized',
                'current_tokens' => $currentTokens,
                'budget' => $budget,
            ]);

            throw new \RuntimeException(
                'Context still over budget after all messages summarized. ' .
                'Last message is too large.'
            );
        }

        $summarizedIndex = $cursor;
        $original = $messages[$summarizedIndex];
        $originalContent = $original['content'] ?? '';
        $originalLen = is_string($originalContent) ? strlen($originalContent) : 0;
        $originalMsgTokens = $this->tokenCounter->count($originalContent);

        $messages[$summarizedIndex] = $this->summarizeMessage($original);
        $summarizedCount++;

        $summaryContent = $messages[$summarizedIndex]['content'];
        $summaryMsgTokens = $this->tokenCounter->count($summaryContent);
        $currentTokens = $this->countTokens($messages);

        $this->log('summarize', [
            'index' => $summarizedIndex,
            'role' => $original['role'] ?? 'unknown',
            'original_chars' => $originalLen,
            'original_tokens' => $originalMsgTokens,
            'summary_tokens' => $summaryMsgTokens,
            'tokens_saved' => $originalMsgTokens - $summaryMsgTokens,
            'running_total_tokens' => $currentTokens,
            'budget' => $budget,
            'md5' => $messages[$summarizedIndex]['_original_md5'] ?? null,
        ]);
    }

    $this->log('fit_summarized', [
        'total_summarized' => $summarizedCount,
        'original_tokens' => $originalTokens,
        'final_tokens' => $currentTokens,
    ]);

    return $messages;
}
|
|
|
|
/**
 * Replace a message's content with a cached or freshly generated summary.
 *
 * The summary is prefixed with `[md5:<hash>]` so the original can be
 * requested later, and the message is tagged with `_summarized` and
 * `_original_md5`. All other fields of the message (for example a
 * `tool_call_id`) are kept, so the summarized message stays structurally
 * equivalent to the original.
 *
 * @param array<string, mixed> $message Message to summarize
 *
 * @return array<string, mixed> The summarized message
 */
private function summarizeMessage(array $message): array
{
    $content = $message['content'] ?? '';

    // Non-string content is serialized once, so the same text is both hashed
    // and summarized; generateSummary() only accepts strings.
    $text = is_string($content) ? $content : (string) json_encode($content);
    $md5 = md5($text);

    // Check cache first
    $cacheKey = "summary:{$md5}";
    $summary = $this->summaryCache->get($cacheKey);

    if ($summary === null) {
        $summary = $this->generateSummary($text);
        $this->summaryCache->set($cacheKey, $summary);
    }

    return array_merge($message, [
        'role' => $message['role'] ?? 'user',
        'content' => "[md5:{$md5}] {$summary}",
        '_summarized' => true,
        '_original_md5' => $md5,
    ]);
}
|
|
|
|
/**
 * Tell whether a message carries the `_summarized` marker.
 *
 * Only the boolean value true counts; a missing key or any other value
 * means the message is treated as an original.
 *
 * @param array<string, mixed> $message Message to inspect
 *
 * @return bool True when `_summarized` is exactly true
 */
private function isSummarized(array $message): bool
{
    $marker = $message['_summarized'] ?? null;

    return $marker === true;
}
|
|
|
|
/**
 * Measure the token size of a message list via the token counter.
 *
 * @param array<int, mixed> $messages Messages to measure
 *
 * @return int Value reported by TokenCounter::contextSize()
 */
private function countTokens(array $messages): int
{
    $size = $this->tokenCounter->contextSize($messages);

    return $size;
}
|
|
|
|
/**
 * Generate a summary for a message's text.
 *
 * Delegates to the configured summarizer when there is one. Otherwise text
 * of up to 100 bytes is returned unchanged and longer text is cut to at most
 * 100 bytes followed by "...". The cut never splits a multibyte character,
 * so the fallback cannot produce invalid UTF-8.
 *
 * @param string $content Text to summarize
 *
 * @return string The summary
 */
private function generateSummary(string $content): string
{
    if ($this->summarizer !== null) {
        return $this->summarizer->summarize($content);
    }

    if (strlen($content) <= 100) {
        return $content;
    }

    // mb_strcut() works on a byte limit like substr() but backs off to the
    // nearest character boundary.
    return mb_strcut($content, 0, 100, 'UTF-8') . '...';
}
|
|
|
|
// -----------------------------------------------------------------
|
|
// PRIVATE: Loop 3 helpers
|
|
// -----------------------------------------------------------------
|
|
|
|
/**
 * Log a memory dump of the current context state.
 *
 * Like dumping CPU registers each cycle - shows what the model "sees": per
 * message the role, summarized flag, md5 reference, token count and a short
 * preview, plus totals. Returns immediately when logging is disabled, so no
 * token counting is done for a dump nobody will read.
 *
 * @param array<int, mixed> $messages  Messages about to be sent to the LLM
 * @param int               $iteration Zero-based round of the execute loop
 */
private function logMemoryDump(array $messages, int $iteration): void
{
    if ($this->logFile === null) {
        return;
    }

    $summarized = 0;
    $original = 0;
    $messageSummary = [];

    foreach ($messages as $i => $msg) {
        $isSummarized = is_array($msg) && $this->isSummarized($msg);
        if ($isSummarized) {
            $summarized++;
        } else {
            $original++;
        }

        $content = $msg['content'] ?? '';

        if (!is_string($content)) {
            $preview = '(non-string content)';
        } elseif (strlen($content) > 80) {
            // Cut on a character boundary so the preview stays valid UTF-8
            // and can be JSON-encoded.
            $preview = mb_strcut($content, 0, 80, 'UTF-8') . '...';
        } else {
            $preview = $content;
        }

        $messageSummary[] = [
            'idx' => $i,
            'role' => $msg['role'] ?? 'unknown',
            'summarized' => $isSummarized,
            'md5' => $msg['_original_md5'] ?? null,
            'tokens' => $this->tokenCounter->count($content),
            'preview' => $preview,
        ];
    }

    $totalTokens = $this->countTokens($messages);

    $this->log('memory_dump', [
        'iteration' => $iteration,
        'total_messages' => count($messages),
        'summarized_count' => $summarized,
        'original_count' => $original,
        'total_tokens' => $totalTokens,
        // Computed from the configured reserve, not from a per-request max_tokens.
        'budget' => $this->maxContextTokens - $this->responseReserve - $this->safetyMargin,
        'messages' => $messageSummary,
    ]);
}
|
|
|
|
/**
 * Look up the original message stored under an MD5 hash.
 *
 * @param string $md5 Hash taken from a summarized message
 *
 * @return array|null Whatever the message store returns for `msg:<md5>`
 */
private function dereference(string $md5): ?array
{
    $storeKey = "msg:{$md5}";

    return $this->messageStore->get($storeKey);
}
|
|
|
|
/**
 * Swap the first message referencing $md5 for the full original message.
 *
 * Only the first message whose `_original_md5` equals $md5 is replaced; the
 * list is returned untouched when nothing matches.
 *
 * @param array<int|string, mixed> $messages    Current message list
 * @param string                   $md5         Hash identifying the summarized message
 * @param array<string, mixed>     $fullMessage Original message to put in its place
 *
 * @return array<int|string, mixed> The message list after substitution
 */
private function injectDereferenced(array $messages, string $md5, array $fullMessage): array
{
    foreach (array_keys($messages) as $position) {
        $referencedHash = $messages[$position]['_original_md5'] ?? null;

        if ($referencedHash !== $md5) {
            continue;
        }

        $messages[$position] = $fullMessage;

        return $messages;
    }

    return $messages;
}
|
|
|
|
// -----------------------------------------------------------------
|
|
// Configuration
|
|
// -----------------------------------------------------------------
|
|
|
|
/**
 * Override the context window size used when computing the token budget.
 *
 * @param int $tokens Context window size in tokens
 *
 * @return self This instance, for chaining
 */
public function setMaxContextTokens(int $tokens): self
{
    $this->maxContextTokens = $tokens;

    return $this;
}
|
|
|
|
/**
 * Override the number of tokens reserved for the model's response.
 *
 * @param int $tokens Tokens to reserve
 *
 * @return self This instance, for chaining
 */
public function setResponseReserve(int $tokens): self
{
    $this->responseReserve = $tokens;

    return $this;
}
|
|
|
|
/**
 * Expose the tool formatter used to build LLM payloads.
 *
 * @return ToolFormatter The formatter instance held by this object
 */
public function getToolFormatter(): ToolFormatter
{
    return $this->toolFormatter;
}
|
|
|
|
/**
 * Expose the parser used to extract tool calls from LLM responses.
 *
 * @return ToolCallParser The parser instance held by this object
 */
public function getToolCallParser(): ToolCallParser
{
    return $this->toolCallParser;
}
|
|
|
|
/**
 * Expose the cache holding original messages.
 *
 * @return CacheInterface The message store currently in use
 */
public function getMessageStore(): CacheInterface
{
    return $this->messageStore;
}
|
|
|
|
/**
 * Expose the cache holding generated summaries.
 *
 * @return CacheInterface The summary cache currently in use
 */
public function getSummaryCache(): CacheInterface
{
    return $this->summaryCache;
}
|
|
}
|