From 84ca520bfb10557a8ed17c35ac02e7111c62347f Mon Sep 17 00:00:00 2001 From: biondizzle Date: Mon, 1 Jun 2026 05:54:44 +0000 Subject: [PATCH] fix: move compressor position_bias into CUDA kernel (was Python loop) The compressor_reduce.cu kernel now adds position_bias to BOTH kv and gate values, matching the PyTorch reference. Previously the kernel only added it to gate, and a Python workaround loop was adding it to both before the kernel call (then passing None to the kernel). Changes: - compressor_reduce.cu: add position_bias to kv_val in pass 2 (CSA + HCA) - single_shot_inference.py: remove Python position_bias loop, pass self.ape directly to csa/hca_compress_production - production_compress.py: already supports position_bias passthrough --- dsv4/kernels/cuda/compressor_reduce.cu | 19 +++++++++++++++---- single_shot_inference.py | 15 +++------------ 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/dsv4/kernels/cuda/compressor_reduce.cu b/dsv4/kernels/cuda/compressor_reduce.cu index 3d7344ee..a6e07caf 100644 --- a/dsv4/kernels/cuda/compressor_reduce.cu +++ b/dsv4/kernels/cuda/compressor_reduce.cu @@ -123,15 +123,21 @@ __global__ void csa_compress_reduce_kernel( if (token_idx < 0 || token_idx >= T) continue; float g = gate_proj[token_idx * kv_dim + gate_offset + c]; + float kv_val = kv_proj[token_idx * kv_dim + kv_offset + c]; + // Position bias: same (m, 2*hd) bias added to every block + // Added to BOTH gate (softmax logit) and kv (content) per reference if (position_bias != nullptr) { int pos_bias_row = (block_i > 0 && t < m) ? t : (block_i > 0 ? (t - m) : t); if (pos_bias_row >= 0 && pos_bias_row < m) { - g += position_bias[pos_bias_row * kv_dim + gate_offset + c]; + float pb = position_bias[pos_bias_row * kv_dim + gate_offset + c]; + g += pb; + // kv_offset matches gate_offset for CSA: both are 0 (a-stream) or hd (b-stream) + kv_val += position_bias[pos_bias_row * kv_dim + kv_offset + c]; } } float e = expf(g - local_max[ci]); local_denom[ci] += e; - local_acc[ci] += e * kv_proj[token_idx * kv_dim + kv_offset + c]; + local_acc[ci] += e * kv_val; } float val = (local_denom[ci] > 0.0f) ? (local_acc[ci] / local_denom[ci]) : 0.0f; @@ -185,12 +191,17 @@ __global__ void hca_compress_reduce_kernel( int token_idx = start + t; if (token_idx >= T) break; float g = gate_proj[token_idx * hd + c]; + float kv_val = kv_proj[token_idx * hd + c]; + // Position bias: same (m, hd) bias added to every block + // Added to BOTH gate (softmax logit) and kv (content) per reference if (position_bias != nullptr && t < m) { - g += position_bias[t * hd + c]; + float pb = position_bias[t * hd + c]; + g += pb; + kv_val += pb; } float e = expf(g - local_max); local_denom += e; - local_acc += e * kv_proj[token_idx * hd + c]; + local_acc += e * kv_val; } float val = (local_denom > 0.0f) ? (local_acc / local_denom) : 0.0f; diff --git a/single_shot_inference.py b/single_shot_inference.py index a4cf7966..a4ab3665 100644 --- a/single_shot_inference.py +++ b/single_shot_inference.py @@ -205,24 +205,15 @@ class Compressor: kv = self.kv_lin(hidden_states).float() # (T, kv_dim) FP32 gate = self.gate_lin(hidden_states).float() # (T, kv_dim) FP32 - # Add position bias if present - if self.ape is not None: - ape = self.ape.float().to(dev) - n_full = T // r - for bi in range(n_full): - s, e = bi * r, (bi + 1) * r - # Position bias is (r, kv_dim) — cyclic per block - kv[s:e] += ape[:r] - gate[s:e] += ape[:r] - + # Position bias is handled inside the CUDA kernel (added to both kv and gate) # Step 3: CUDA softmax/reduce kernel from dsv4.kernels.compressor.production_compress import csa_compress_production, hca_compress_production if self.is_csa: compressed = csa_compress_production( - kv, gate, None, self.kv_norm_w, m=r) + kv, gate, self.ape, self.kv_norm_w, m=r) else: compressed = hca_compress_production( - kv, gate, None, self.kv_norm_w, m=r) + kv, gate, self.ape, self.kv_norm_w, m=r) if compressed.shape[0] == 0: return None, None, None comp_pos = torch.tensor([positions[(bi+1)*r - 1].item() if positions.numel() > (bi+1)*r - 1 else 0