fix: move compressor position_bias into CUDA kernel (was Python loop)

The compressor_reduce.cu kernel now adds position_bias to BOTH kv and gate values, matching the PyTorch reference. Previously the kernel only added it to gate, and a Python workaround loop was adding it to both before the kernel call (then passing None to the kernel). Changes: - compressor_reduce.cu: add position_bias to kv_val in pass 2 (CSA + HCA) - single_shot_inference.py: remove Python position_bias loop, pass self.ape directly to csa/hca_compress_production - production_compress.py: already supports position_bias passthrough
2026-06-01 05:54:44 +00:00
parent 311fae490f
commit 84ca520bfb
2 changed files with 18 additions and 16 deletions
--- a/dsv4/kernels/cuda/compressor_reduce.cu
+++ b/dsv4/kernels/cuda/compressor_reduce.cu
@@ -123,15 +123,21 @@ __global__ void csa_compress_reduce_kernel(
            if (token_idx < 0 || token_idx >= T) continue;

            float g = gate_proj[token_idx * kv_dim + gate_offset + c];
+            float kv_val = kv_proj[token_idx * kv_dim + kv_offset + c];
+            // Position bias: same (m, 2*hd) bias added to every block
+            // Added to BOTH gate (softmax logit) and kv (content) per reference
            if (position_bias != nullptr) {
                int pos_bias_row = (block_i > 0 && t < m) ? t : (block_i > 0 ? (t - m) : t);
                if (pos_bias_row >= 0 && pos_bias_row < m) {
-                    g += position_bias[pos_bias_row * kv_dim + gate_offset + c];
+                    float pb = position_bias[pos_bias_row * kv_dim + gate_offset + c];
+                    g += pb;
+                    // kv_offset matches gate_offset for CSA: both are 0 (a-stream) or hd (b-stream)
+                    kv_val += position_bias[pos_bias_row * kv_dim + kv_offset + c];
                }
            }
            float e = expf(g - local_max[ci]);
            local_denom[ci] += e;
-            local_acc[ci] += e * kv_proj[token_idx * kv_dim + kv_offset + c];
+            local_acc[ci] += e * kv_val;
        }

        float val = (local_denom[ci] > 0.0f) ? (local_acc[ci] / local_denom[ci]) : 0.0f;
@@ -185,12 +191,17 @@ __global__ void hca_compress_reduce_kernel(
            int token_idx = start + t;
            if (token_idx >= T) break;
            float g = gate_proj[token_idx * hd + c];
+            float kv_val = kv_proj[token_idx * hd + c];
+            // Position bias: same (m, hd) bias added to every block
+            // Added to BOTH gate (softmax logit) and kv (content) per reference
            if (position_bias != nullptr && t < m) {
-                g += position_bias[t * hd + c];
+                float pb = position_bias[t * hd + c];
+                g += pb;
+                kv_val += pb;
            }
            float e = expf(g - local_max);
            local_denom += e;
-            local_acc += e * kv_proj[token_idx * hd + c];
+            local_acc += e * kv_val;
        }

        float val = (local_denom > 0.0f) ? (local_acc / local_denom) : 0.0f;
--- a/single_shot_inference.py
+++ b/single_shot_inference.py
@@ -205,24 +205,15 @@ class Compressor:
        kv = self.kv_lin(hidden_states).float()    # (T, kv_dim) FP32
        gate = self.gate_lin(hidden_states).float()  # (T, kv_dim) FP32

-        # Add position bias if present
-        if self.ape is not None:
-            ape = self.ape.float().to(dev)
-            n_full = T // r
-            for bi in range(n_full):
-                s, e = bi * r, (bi + 1) * r
-                # Position bias is (r, kv_dim) — cyclic per block
-                kv[s:e] += ape[:r]
-                gate[s:e] += ape[:r]
-
+        # Position bias is handled inside the CUDA kernel (added to both kv and gate)
        # Step 3: CUDA softmax/reduce kernel
        from dsv4.kernels.compressor.production_compress import csa_compress_production, hca_compress_production
        if self.is_csa:
            compressed = csa_compress_production(
-                kv, gate, None, self.kv_norm_w, m=r)
+                kv, gate, self.ape, self.kv_norm_w, m=r)
        else:
            compressed = hca_compress_production(
-                kv, gate, None, self.kv_norm_w, m=r)
+                kv, gate, self.ape, self.kv_norm_w, m=r)

        if compressed.shape[0] == 0: return None, None, None
        comp_pos = torch.tensor([positions[(bi+1)*r - 1].item() if positions.numel() > (bi+1)*r - 1 else 0