fix: move compressor position_bias into CUDA kernel (was Python loop)
The compressor_reduce.cu kernel now adds position_bias to BOTH kv and gate values, matching the PyTorch reference. Previously the kernel only added it to gate, and a Python workaround loop was adding it to both before the kernel call (then passing None to the kernel). Changes: - compressor_reduce.cu: add position_bias to kv_val in pass 2 (CSA + HCA) - single_shot_inference.py: remove Python position_bias loop, pass self.ape directly to csa/hca_compress_production - production_compress.py: already supports position_bias passthrough
This commit is contained in:
@@ -123,15 +123,21 @@ __global__ void csa_compress_reduce_kernel(
|
||||
if (token_idx < 0 || token_idx >= T) continue;
|
||||
|
||||
float g = gate_proj[token_idx * kv_dim + gate_offset + c];
|
||||
float kv_val = kv_proj[token_idx * kv_dim + kv_offset + c];
|
||||
// Position bias: same (m, 2*hd) bias added to every block
|
||||
// Added to BOTH gate (softmax logit) and kv (content) per reference
|
||||
if (position_bias != nullptr) {
|
||||
int pos_bias_row = (block_i > 0 && t < m) ? t : (block_i > 0 ? (t - m) : t);
|
||||
if (pos_bias_row >= 0 && pos_bias_row < m) {
|
||||
g += position_bias[pos_bias_row * kv_dim + gate_offset + c];
|
||||
float pb = position_bias[pos_bias_row * kv_dim + gate_offset + c];
|
||||
g += pb;
|
||||
// kv_offset matches gate_offset for CSA: both are 0 (a-stream) or hd (b-stream)
|
||||
kv_val += position_bias[pos_bias_row * kv_dim + kv_offset + c];
|
||||
}
|
||||
}
|
||||
float e = expf(g - local_max[ci]);
|
||||
local_denom[ci] += e;
|
||||
local_acc[ci] += e * kv_proj[token_idx * kv_dim + kv_offset + c];
|
||||
local_acc[ci] += e * kv_val;
|
||||
}
|
||||
|
||||
float val = (local_denom[ci] > 0.0f) ? (local_acc[ci] / local_denom[ci]) : 0.0f;
|
||||
@@ -185,12 +191,17 @@ __global__ void hca_compress_reduce_kernel(
|
||||
int token_idx = start + t;
|
||||
if (token_idx >= T) break;
|
||||
float g = gate_proj[token_idx * hd + c];
|
||||
float kv_val = kv_proj[token_idx * hd + c];
|
||||
// Position bias: same (m, hd) bias added to every block
|
||||
// Added to BOTH gate (softmax logit) and kv (content) per reference
|
||||
if (position_bias != nullptr && t < m) {
|
||||
g += position_bias[t * hd + c];
|
||||
float pb = position_bias[t * hd + c];
|
||||
g += pb;
|
||||
kv_val += pb;
|
||||
}
|
||||
float e = expf(g - local_max);
|
||||
local_denom += e;
|
||||
local_acc += e * kv_proj[token_idx * hd + c];
|
||||
local_acc += e * kv_val;
|
||||
}
|
||||
|
||||
float val = (local_denom > 0.0f) ? (local_acc / local_denom) : 0.0f;
|
||||
|
||||
@@ -205,24 +205,15 @@ class Compressor:
|
||||
kv = self.kv_lin(hidden_states).float() # (T, kv_dim) FP32
|
||||
gate = self.gate_lin(hidden_states).float() # (T, kv_dim) FP32
|
||||
|
||||
# Add position bias if present
|
||||
if self.ape is not None:
|
||||
ape = self.ape.float().to(dev)
|
||||
n_full = T // r
|
||||
for bi in range(n_full):
|
||||
s, e = bi * r, (bi + 1) * r
|
||||
# Position bias is (r, kv_dim) — cyclic per block
|
||||
kv[s:e] += ape[:r]
|
||||
gate[s:e] += ape[:r]
|
||||
|
||||
# Position bias is handled inside the CUDA kernel (added to both kv and gate)
|
||||
# Step 3: CUDA softmax/reduce kernel
|
||||
from dsv4.kernels.compressor.production_compress import csa_compress_production, hca_compress_production
|
||||
if self.is_csa:
|
||||
compressed = csa_compress_production(
|
||||
kv, gate, None, self.kv_norm_w, m=r)
|
||||
kv, gate, self.ape, self.kv_norm_w, m=r)
|
||||
else:
|
||||
compressed = hca_compress_production(
|
||||
kv, gate, None, self.kv_norm_w, m=r)
|
||||
kv, gate, self.ape, self.kv_norm_w, m=r)
|
||||
|
||||
if compressed.shape[0] == 0: return None, None, None
|
||||
comp_pos = torch.tensor([positions[(bi+1)*r - 1].item() if positions.numel() > (bi+1)*r - 1 else 0
|
||||
|
||||
Reference in New Issue
Block a user