"""Fixed-size block allocator for the classical paged KV cache.

One BlockAllocator per layer per "pool kind" (classical / indexer).
Total blocks are sized at engine startup. Blocks are recycled on
request completion.

Cudagraph-safety: allocation can't happen inside a captured graph
(allocation rate is per-request not per-token). The contract is:
  - acquire() called between graph captures.
  - release() called between graph captures.
  - read access (via block table) happens INSIDE captured graphs.
"""
from __future__ import annotations

import torch


class BlockAllocator:
    """Stack-based free list over a fixed pool of integer block IDs.

    The pool initially holds the IDs ``0 .. num_total_blocks - 1``.
    ``acquire`` pops IDs off the top of the stack and ``release`` pushes
    them back; the stack depth is tracked on the host in ``top_cpu``.
    """

    def __init__(
        self,
        num_total_blocks: int,
        device: str = "cuda",
    ):
        """Create an allocator whose blocks are all initially free.

        Args:
            num_total_blocks: Capacity of the pool (number of block IDs).
            device: Torch device on which the free-list tensor is stored.
        """
        self.num_total_blocks = num_total_blocks
        self.device = device
        # Free-list as a stack stored on `device`: free_ids[0..top-1] holds
        # the free block IDs.
        self.free_ids = torch.arange(
            num_total_blocks, dtype=torch.int32, device=device,
        )
        # `top` lives in host memory so it can be read without a device
        # sync (it's modified only between graph captures). Pinning needs a
        # CUDA runtime, so only request it when one is available; this keeps
        # the allocator constructible with device="cpu" on CUDA-less hosts.
        self.top_cpu = torch.tensor(
            [num_total_blocks],
            dtype=torch.int32,
            pin_memory=torch.cuda.is_available(),
        )

    @property
    def num_free(self) -> int:
        """Number of block IDs currently available to `acquire`."""
        return int(self.top_cpu[0])

    def acquire(self, n: int) -> torch.Tensor:
        """Return a tensor of `n` block IDs. Called between captures.

        Args:
            n: Number of blocks to take from the free list (>= 0).

        Returns:
            A 1-D int32 tensor on the allocator's device holding `n` IDs.
            It is a copy, so later releases cannot overwrite it.

        Raises:
            ValueError: If `n` is negative.
            RuntimeError: If fewer than `n` blocks are free.
        """
        if n < 0:
            # A negative count would slip past the OOM check below and
            # *raise* `top`, corrupting the free list.
            raise ValueError(f"cannot acquire a negative number of blocks: {n}")
        top = int(self.top_cpu[0])
        if n > top:
            raise RuntimeError(
                f"KV cache OOM: requested {n} blocks, {top} available "
                f"(of {self.num_total_blocks} total)"
            )
        new_top = top - n
        ids = self.free_ids[new_top:top].clone()  # snapshot
        self.top_cpu[0] = new_top
        return ids

    def release(self, ids: torch.Tensor) -> None:
        """Return blocks to the free list. Called between captures.

        Args:
            ids: Tensor of block IDs to give back. Any shape is accepted;
                it is flattened and converted to the free list's dtype
                and device.

        Raises:
            RuntimeError: If the release would push the free count above
                `num_total_blocks` (more blocks returned than were ever
                handed out). Individual IDs are not validated, so a double
                free that stays within capacity is not detected.
        """
        flat = ids.reshape(-1)
        n = flat.numel()
        if n == 0:
            return
        top = int(self.top_cpu[0])
        if top + n > self.num_total_blocks:
            # Without this check the slice below is silently truncated; a
            # single-ID release into a full pool would even "succeed" via
            # broadcasting into an empty slice and leave `top` past capacity.
            raise RuntimeError(
                f"KV cache over-release: returning {n} blocks with {top} "
                f"already free (of {self.num_total_blocks} total)"
            )
        self.free_ids[top:top + n] = flat.to(
            device=self.device, dtype=self.free_ids.dtype,
        )
        self.top_cpu[0] = top + n