From d2bbdd59f63e9db71a5031cc90287d76acdf83f9 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Fri, 22 May 2026 17:51:25 +0000 Subject: [PATCH] =?UTF-8?q?Try=20cutlass.range=20with=20Int32(kt)=20?= =?UTF-8?q?=E2=80=94=20now=20n=5Fkv=5Ftiles=20is=20Python=20int?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/test_fmha_v3_stage_c_full.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_fmha_v3_stage_c_full.py b/tests/unit/test_fmha_v3_stage_c_full.py index 235b44a9..57022f2e 100644 --- a/tests/unit/test_fmha_v3_stage_c_full.py +++ b/tests/unit/test_fmha_v3_stage_c_full.py @@ -205,10 +205,9 @@ class FmhaV3StageC: cute.copy(tma_q, tAgQ[(None, Int32(0))], tAsQ[(None, qh.index)], tma_bar_ptr=qh.barrier) qp.tail() kvp.reset(); pk = kvp.try_acquire() - # Python range() unrolls at trace time. Each iteration emits a - # separate cute.copy with a distinct compile-time Int32 constant. - # We proved Int32(1) hardcoded works — by induction Int32(k) works. - for kt in range(self.n_kv_tiles): + # Use cutlass.range with Python int n_kv_tiles for proper pipeline + # semantics (acquire/release). Wrap kt in Int32() for TMA coordinate. + for kt in cutlass.range(self.n_kv_tiles, unroll=1): coord = Int32(kt) kvh = kvp.acquire_and_advance(pk) cute.copy(tma_k, tBgK[(None, coord)], tBsK[(None, kvh.index)], tma_bar_ptr=kvh.barrier)