From 84ca520bfb10557a8ed17c35ac02e7111c62347f Mon Sep 17 00:00:00 2001
From: biondizzle <biondizzle@gmail.com>
Date: Mon, 1 Jun 2026 05:54:44 +0000
Subject: [PATCH] fix: move compressor position_bias into CUDA kernel (was
 Python loop)

The compressor_reduce.cu kernel now adds position_bias to BOTH kv and
gate values, matching the PyTorch reference. Previously the kernel only
added it to gate, and a Python workaround loop was adding it to both
before the kernel call (then passing None to the kernel).

Changes:
- compressor_reduce.cu: add position_bias to kv_val in pass 2 (CSA + HCA)
- single_shot_inference.py: remove Python position_bias loop, pass
  self.ape directly to csa/hca_compress_production
- production_compress.py: already supports position_bias passthrough
---
 dsv4/kernels/cuda/compressor_reduce.cu | 19 +++++++++++++++----
 single_shot_inference.py               | 15 +++------------
 2 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/dsv4/kernels/cuda/compressor_reduce.cu b/dsv4/kernels/cuda/compressor_reduce.cu
index 3d7344ee..a6e07caf 100644
--- a/dsv4/kernels/cuda/compressor_reduce.cu
+++ b/dsv4/kernels/cuda/compressor_reduce.cu
@@ -123,15 +123,21 @@ __global__ void csa_compress_reduce_kernel(
             if (token_idx < 0 || token_idx >= T) continue;
 
             float g = gate_proj[token_idx * kv_dim + gate_offset + c];
+            float kv_val = kv_proj[token_idx * kv_dim + kv_offset + c];
+            // Position bias: same (m, 2*hd) bias added to every block
+            // Added to BOTH gate (softmax logit) and kv (content) per reference
             if (position_bias != nullptr) {
                 int pos_bias_row = (block_i > 0 && t < m) ? t : (block_i > 0 ? (t - m) : t);
                 if (pos_bias_row >= 0 && pos_bias_row < m) {
-                    g += position_bias[pos_bias_row * kv_dim + gate_offset + c];
+                    float pb = position_bias[pos_bias_row * kv_dim + gate_offset + c];
+                    g += pb;
+                    // kv_offset matches gate_offset for CSA: both are 0 (a-stream) or hd (b-stream)
+                    kv_val += position_bias[pos_bias_row * kv_dim + kv_offset + c];
                 }
             }
             float e = expf(g - local_max[ci]);
             local_denom[ci] += e;
-            local_acc[ci] += e * kv_proj[token_idx * kv_dim + kv_offset + c];
+            local_acc[ci] += e * kv_val;
         }
 
         float val = (local_denom[ci] > 0.0f) ? (local_acc[ci] / local_denom[ci]) : 0.0f;
@@ -185,12 +191,17 @@ __global__ void hca_compress_reduce_kernel(
             int token_idx = start + t;
             if (token_idx >= T) break;
             float g = gate_proj[token_idx * hd + c];
+            float kv_val = kv_proj[token_idx * hd + c];
+            // Position bias: same (m, hd) bias added to every block
+            // Added to BOTH gate (softmax logit) and kv (content) per reference
             if (position_bias != nullptr && t < m) {
-                g += position_bias[t * hd + c];
+                float pb = position_bias[t * hd + c];
+                g += pb;
+                kv_val += pb;
             }
             float e = expf(g - local_max);
             local_denom += e;
-            local_acc += e * kv_proj[token_idx * hd + c];
+            local_acc += e * kv_val;
         }
 
         float val = (local_denom > 0.0f) ? (local_acc / local_denom) : 0.0f;
diff --git a/single_shot_inference.py b/single_shot_inference.py
index a4cf7966..a4ab3665 100644
--- a/single_shot_inference.py
+++ b/single_shot_inference.py
@@ -205,24 +205,15 @@ class Compressor:
         kv = self.kv_lin(hidden_states).float()    # (T, kv_dim) FP32
         gate = self.gate_lin(hidden_states).float()  # (T, kv_dim) FP32
 
-        # Add position bias if present
-        if self.ape is not None:
-            ape = self.ape.float().to(dev)
-            n_full = T // r
-            for bi in range(n_full):
-                s, e = bi * r, (bi + 1) * r
-                # Position bias is (r, kv_dim) — cyclic per block
-                kv[s:e] += ape[:r]
-                gate[s:e] += ape[:r]
-
+        # Position bias is handled inside the CUDA kernel (added to both kv and gate)
         # Step 3: CUDA softmax/reduce kernel
         from dsv4.kernels.compressor.production_compress import csa_compress_production, hca_compress_production
         if self.is_csa:
             compressed = csa_compress_production(
-                kv, gate, None, self.kv_norm_w, m=r)
+                kv, gate, self.ape, self.kv_norm_w, m=r)
         else:
             compressed = hca_compress_production(
-                kv, gate, None, self.kv_norm_w, m=r)
+                kv, gate, self.ape, self.kv_norm_w, m=r)
 
         if compressed.shape[0] == 0: return None, None, None
         comp_pos = torch.tensor([positions[(bi+1)*r - 1].item() if positions.numel() > (bi+1)*r - 1 else 0