Fix: use self.scale instead of self.softmax_scale in Blackwell attention path

This commit is contained in:
2026-05-19 10:04:46 +00:00
parent 39310c357d
commit 7e97551fd3

View File

@@ -631,7 +631,7 @@ class DeepseekV4MultiHeadLatentAttentionWrapper(PluggableLayer):
return
# Attention using PyTorch SDPA (works on Blackwell)
- o = full_sdpa_attention(q, kv, self.softmax_scale)
+ o = full_sdpa_attention(q, kv, self.scale)
# Write into the output buffer (same shape as original path)
if self.n_local_heads < self.padded_heads: