Fix: use self.scale instead of self.softmax_scale in Blackwell attention path
This commit is contained in:
@@ -631,7 +631,7 @@ class DeepseekV4MultiHeadLatentAttentionWrapper(PluggableLayer):
|
||||
return
|
||||
|
||||
# Attention using PyTorch SDPA (works on Blackwell)
|
||||
- o = full_sdpa_attention(q, kv, self.softmax_scale)
|
||||
+ o = full_sdpa_attention(q, kv, self.scale)
|
||||
|
||||
# Write into the output buffer (same shape as original path)
|
||||
if self.n_local_heads < self.padded_heads:
|
||||
|
||||
Reference in New Issue
Block a user