fix: move compressor position_bias into CUDA kernel (was Python loop)

The compressor_reduce.cu kernel now adds position_bias to BOTH kv and
gate values, matching the PyTorch reference. Previously the kernel only
added it to gate, and a Python workaround loop was adding it to both
before the kernel call (then passing None to the kernel).

Changes:
- compressor_reduce.cu: add position_bias to kv_val in pass 2 (CSA + HCA)
- single_shot_inference.py: remove Python position_bias loop, pass
  self.ape directly to csa/hca_compress_production
- production_compress.py: already supports position_bias passthrough
This commit is contained in:
2026-06-01 05:54:44 +00:00
parent 311fae490f
commit 84ca520bfb
2 changed files with 18 additions and 16 deletions

View File

@@ -123,15 +123,21 @@ __global__ void csa_compress_reduce_kernel(
if (token_idx < 0 || token_idx >= T) continue;
float g = gate_proj[token_idx * kv_dim + gate_offset + c];
float kv_val = kv_proj[token_idx * kv_dim + kv_offset + c];
// Position bias: same (m, 2*hd) bias added to every block
// Added to BOTH gate (softmax logit) and kv (content) per reference
if (position_bias != nullptr) {
int pos_bias_row = (block_i > 0 && t < m) ? t : (block_i > 0 ? (t - m) : t);
if (pos_bias_row >= 0 && pos_bias_row < m) {
g += position_bias[pos_bias_row * kv_dim + gate_offset + c];
float pb = position_bias[pos_bias_row * kv_dim + gate_offset + c];
g += pb;
// kv_offset matches gate_offset for CSA: both are 0 (a-stream) or hd (b-stream)
kv_val += position_bias[pos_bias_row * kv_dim + kv_offset + c];
}
}
float e = expf(g - local_max[ci]);
local_denom[ci] += e;
local_acc[ci] += e * kv_proj[token_idx * kv_dim + kv_offset + c];
local_acc[ci] += e * kv_val;
}
float val = (local_denom[ci] > 0.0f) ? (local_acc[ci] / local_denom[ci]) : 0.0f;
@@ -185,12 +191,17 @@ __global__ void hca_compress_reduce_kernel(
int token_idx = start + t;
if (token_idx >= T) break;
float g = gate_proj[token_idx * hd + c];
float kv_val = kv_proj[token_idx * hd + c];
// Position bias: same (m, hd) bias added to every block
// Added to BOTH gate (softmax logit) and kv (content) per reference
if (position_bias != nullptr && t < m) {
g += position_bias[t * hd + c];
float pb = position_bias[t * hd + c];
g += pb;
kv_val += pb;
}
float e = expf(g - local_max);
local_denom += e;
local_acc += e * kv_proj[token_idx * hd + c];
local_acc += e * kv_val;
}
float val = (local_denom > 0.0f) ? (local_acc / local_denom) : 0.0f;

View File

@@ -205,24 +205,15 @@ class Compressor:
kv = self.kv_lin(hidden_states).float() # (T, kv_dim) FP32
gate = self.gate_lin(hidden_states).float() # (T, kv_dim) FP32
# Add position bias if present
if self.ape is not None:
ape = self.ape.float().to(dev)
n_full = T // r
for bi in range(n_full):
s, e = bi * r, (bi + 1) * r
# Position bias is (r, kv_dim) — cyclic per block
kv[s:e] += ape[:r]
gate[s:e] += ape[:r]
# Position bias is handled inside the CUDA kernel (added to both kv and gate)
# Step 3: CUDA softmax/reduce kernel
from dsv4.kernels.compressor.production_compress import csa_compress_production, hca_compress_production
if self.is_csa:
compressed = csa_compress_production(
kv, gate, None, self.kv_norm_w, m=r)
kv, gate, self.ape, self.kv_norm_w, m=r)
else:
compressed = hca_compress_production(
kv, gate, None, self.kv_norm_w, m=r)
kv, gate, self.ape, self.kv_norm_w, m=r)
if compressed.shape[0] == 0: return None, None, None
comp_pos = torch.tensor([positions[(bi+1)*r - 1].item() if positions.numel() > (bi+1)*r - 1 else 0