D1.4: Guard LSE computation with const_expr(not self.normalize) - fixes BF16 type mismatch in regression test

This commit is contained in:
2026-05-24 15:11:39 +00:00
parent d8e2a8f33e
commit 423d97b094

View File

@@ -127,7 +127,6 @@ class FmhaKernel:
tma_c,mC = cpasync.make_tiled_tma_atom(cpasync.CopyBulkTensorTileS2GOp(),c,epi_s,self.epi_tile)
# Always create a valid mLSE tensor for the kernel.
# CuTeDSL doesn't support None parameters in @cute.kernel.
# For normalize=True, mLSE is unused (dead-code-eliminated by compiler).
if const_expr(lse is None):
lse = cute.make_tensor(c.iterator, cute.make_layout((1,), stride=(0,)))
self._kernel(qk_mma,pv_mma,tma_q,mQ,tma_k,mK,tma_v,mV,tma_c,mC,self.cluster_layout_vmnk,self.q_smem_s,self.k_smem_s,self.v_smem_s,self.p_tmem_s,self.p_smem_s,self.c_smem_s,self.epi_tile,lse).launch(grid=(1,1,1),block=[self.threads_per_cta,1,1],stream=stream)
@@ -504,15 +503,16 @@ class FmhaKernel:
c_pipe.producer_tail()
# Compute LSE: lse = ln(row_sum) + row_max * ln(2)
# Always compute LSE (needed for external normalization).
# row_max is in scale_log2 domain, multiply by ln(2) to convert.
_row_max_safe = row_max
if row_max == -cutlass.Float32.inf:
_row_max_safe = Float32(0.0)
if sfw_idx == 0:
_ln2 = Float32(0.6931471805599453) # ln(2)
lse_val = cute.math.log(row_sum, fastmath=True) + _row_max_safe * _ln2
mLSE[0] = lse_val
# LSE is computed only when emitting un-normalized output (D5a path).
# When normalize=True, normalization happens in-kernel, so LSE is not needed.
if const_expr(not self.normalize):
_row_max_safe = row_max
if row_max == -cutlass.Float32.inf:
_row_max_safe = Float32(0.0)
if sfw_idx == 0:
_ln2 = Float32(0.6931471805599453) # ln(2)
lse_val = cute.math.log(row_sum, fastmath=True) + _row_max_safe * _ln2
mLSE[0] = lse_val
tmem.relinquish_alloc_permit()
tmem.free(tmem_ptr)