From aba8f111d54e3bb8143cdbe654917d0ac573621f Mon Sep 17 00:00:00 2001 From: biondizzle Date: Sat, 23 May 2026 21:13:07 +0000 Subject: [PATCH] D5a: Use cute.store for LSE write --- dsv4/kernels/attention/fmha.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dsv4/kernels/attention/fmha.py b/dsv4/kernels/attention/fmha.py index 2eeb3eea..1cdab662 100644 --- a/dsv4/kernels/attention/fmha.py +++ b/dsv4/kernels/attention/fmha.py @@ -446,16 +446,14 @@ class FmhaKernel: # D5a: Write LSE (log-sum-exp) when normalize=False # lse = log(row_sum) + row_max_safe (row_max in scaled domain) - # row_max_safe = row_max if row_max != -inf else 0 # Only thread 0 of the epilogue warps writes LSE for this tile. if const_expr(not self.normalize): - # Compute row_max_safe from the final row_max _row_max_safe = row_max if row_max == -cutlass.Float32.inf: _row_max_safe = Float32(0.0) if sfw_idx == 0: lse_val = cute.math.log(row_sum, fastmath=True) + _row_max_safe - mLSE[None, None, 0] = lse_val + cute.store(lse_val, mLSE[None, None, 0]) tmem.relinquish_alloc_permit() tmem.free(tmem_ptr)