diff --git a/single_shot_inference.py b/single_shot_inference.py index e3c91ce6..224c634d 100644 --- a/single_shot_inference.py +++ b/single_shot_inference.py @@ -455,7 +455,8 @@ class KVCache: # P3: Pre-allocate compressed KV buffers (no more torch.cat / O(N²) growth) self.comp_kv_buf = torch.zeros(max_comp, head_dim, dtype=torch.bfloat16, device=device) self.comp_pos_buf = torch.zeros(max_comp, dtype=torch.long, device=device) - self.comp_idx_buf = torch.zeros(max_comp, head_dim, dtype=torch.bfloat16, device=device) + # Indexer compressed keys are width ihd (128), NOT head_dim (512) + self.comp_idx_buf = torch.zeros(max_comp, 128, dtype=torch.bfloat16, device=device) # PROBE: width=ihd self.n_comp = 0 self._has_idx = False