From 2bb3eb95ede760ffc6c255ab12ba78406fb86190 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Sat, 23 May 2026 21:03:00 +0000 Subject: [PATCH] D1.3: Fix tOrP0 for SMEM-P - skip make_tensor when offset is 0 CuTeDSL doesn't support OpResult + int. When offset is 0 (SMEM-P), just use tOrP directly. --- dsv4/kernels/attention/fmha.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dsv4/kernels/attention/fmha.py b/dsv4/kernels/attention/fmha.py index 36140616..93351c85 100644 --- a/dsv4/kernels/attention/fmha.py +++ b/dsv4/kernels/attention/fmha.py @@ -181,7 +181,10 @@ class FmhaKernel: # For SMEM-P, offset is 0 (P not in TMEM). # Must be defined unconditionally (CuTeDSL scoping). _p0_bf16_offset = max(self.tmem_p0_offset, 0) * (32 // 16) # Python int - tOrP0 = cute.make_tensor(tOrP.iterator + _p0_bf16_offset, tOrP.layout) + if _p0_bf16_offset > 0: + tOrP0 = cute.make_tensor(tOrP.iterator + _p0_bf16_offset, tOrP.layout) + else: + tOrP0 = tOrP tCtO_fake = pv_mma.make_fragment_C(cute.append(pv_as, self.num_acc_stage)) pipeline.pipeline_init_wait(cluster_shape_mn=cl_vmnk)