D1.3: Initialize tOrP0 before conditional for CuTeDSL scoping

This commit is contained in:
2026-05-23 21:03:53 +00:00
parent 17109a8f04
commit 399a241e84

View File

@@ -181,10 +181,11 @@ class FmhaKernel:
# For SMEM-P, offset is 0 (P not in TMEM).
# Must be defined unconditionally (CuTeDSL scoping).
_p0_bf16_offset = max(self.tmem_p0_offset, 0) * (32 // 16) # Python int
# Must define tOrP0 before conditional (CuTeDSL scoping rule).
# Initialize to tOrP, then override with offset for TMEM-P.
tOrP0 = tOrP
if _p0_bf16_offset > 0:
tOrP0 = cute.make_tensor(tOrP.iterator + _p0_bf16_offset, tOrP.layout)
else:
tOrP0 = tOrP
tCtO_fake = pv_mma.make_fragment_C(cute.append(pv_as, self.num_acc_stage))
pipeline.pipeline_init_wait(cluster_shape_mn=cl_vmnk)