D1.4: Guard LSE computation with const_expr(not normalize) - fixes BF16 type mismatch in regression test

2026-05-24 15:11:39 +00:00
parent d8e2a8f33e
commit 423d97b094
1 changed files with 10 additions and 10 deletions
--- a/dsv4/kernels/attention/fmha.py
+++ b/dsv4/kernels/attention/fmha.py
@@ -127,7 +127,6 @@ class FmhaKernel:
        tma_c,mC = cpasync.make_tiled_tma_atom(cpasync.CopyBulkTensorTileS2GOp(),c,epi_s,self.epi_tile)
        # Always create a valid mLSE tensor for the kernel.
        # CuTeDSL doesn't support None parameters in @cute.kernel.
-        # For normalize=True, mLSE is unused (dead-code-eliminated by compiler).
        if const_expr(lse is None):
            lse = cute.make_tensor(c.iterator, cute.make_layout((1,), stride=(0,)))
        self._kernel(qk_mma,pv_mma,tma_q,mQ,tma_k,mK,tma_v,mV,tma_c,mC,self.cluster_layout_vmnk,self.q_smem_s,self.k_smem_s,self.v_smem_s,self.p_tmem_s,self.p_smem_s,self.c_smem_s,self.epi_tile,lse).launch(grid=(1,1,1),block=[self.threads_per_cta,1,1],stream=stream)
@@ -504,15 +503,16 @@ class FmhaKernel:
            c_pipe.producer_tail()

            # Compute LSE: lse = ln(row_sum) + row_max * ln(2)
-            # Always compute LSE (needed for external normalization).
-            # row_max is in scale_log2 domain, multiply by ln(2) to convert.
-            _row_max_safe = row_max
-            if row_max == -cutlass.Float32.inf:
-                _row_max_safe = Float32(0.0)
-            if sfw_idx == 0:
-                _ln2 = Float32(0.6931471805599453)  # ln(2)
-                lse_val = cute.math.log(row_sum, fastmath=True) + _row_max_safe * _ln2
-                mLSE[0] = lse_val
+            # row_max is in the scale_log2 domain, so multiply by ln(2) to convert.
+            # Only needed for un-normalized output (D5a path); normalize=True normalizes in-kernel.
+            if const_expr(not self.normalize):
+                _row_max_safe = row_max
+                if row_max == -cutlass.Float32.inf:
+                    _row_max_safe = Float32(0.0)
+                if sfw_idx == 0:
+                    _ln2 = Float32(0.6931471805599453)  # ln(2)
+                    lse_val = cute.math.log(row_sum, fastmath=True) + _row_max_safe * _ln2
+                    mLSE[0] = lse_val

            tmem.relinquish_alloc_permit()
            tmem.free(tmem_ptr)