diff --git a/single_shot_inference.py b/single_shot_inference.py index 608d81a1..17bbab3d 100644 --- a/single_shot_inference.py +++ b/single_shot_inference.py @@ -313,7 +313,9 @@ class Compressor: self.kv_norm_w = w.get(f"{pfx}.kv_norm.weight") def forward(self, hidden_states, positions): - if self.ratio == 0 or self.kv_lin is None: return None, None, None + if self.ratio == 0 or self.kv_lin is None: + print(f" COMPRESSOR EARLY RETURN: ratio={self.ratio} kv_lin={self.kv_lin is not None} hd={self.hd} kv_dim={self.kv_dim}", flush=True) + return None, None, None T = hidden_states.shape[0]; r = self.ratio; dev = hidden_states.device # P7: Buffer decode steps until we have a complete block. @@ -330,6 +332,7 @@ class Compressor: self._pos_buffer[self._buf_len] = positions[0] if positions.numel() == 1 else positions[self._buf_len] self._buf_len += 1 if self._buf_len < r: + print(f" COMPRESSOR BUFFERING: hd={self.hd} buf_len={self._buf_len} r={r}", flush=True) return None, None, None # Not enough tokens yet # We have a full buffer — use it hidden_states = self._hs_buffer[:self._buf_len]