From cf264bd0e29864ce0777efc2c5b57f58b8f78f86 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Thu, 28 May 2026 06:36:50 +0000 Subject: [PATCH] fix: cvta.shared.u32 (not cvta.to.shared) --- dsv4/kernels/attention/fmha_epilogue_sm100.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dsv4/kernels/attention/fmha_epilogue_sm100.cuh b/dsv4/kernels/attention/fmha_epilogue_sm100.cuh index 1c767cf3..733409db 100644 --- a/dsv4/kernels/attention/fmha_epilogue_sm100.cuh +++ b/dsv4/kernels/attention/fmha_epilogue_sm100.cuh @@ -30,7 +30,7 @@ fmha_decode_tmem( float* sRowSums = (float*)(sbuf + HD*sizeof(float)); // Use remaining SMEM for TMEM allocation (tcgen05.alloc maps it) uint32_t tmem_smem_ptr = 0; - asm("cvta.to.shared.u32 %0, %1;" : "=r"(tmem_smem_ptr) : "l"(sbuf)); + asm("cvta.shared.u32 %0, %1;" : "=r"(tmem_smem_ptr) : "l"(sbuf)); // TMEM column count: each tcgen05.ld reads 4 FP32 per column (16 rows × 256 bits) // For T=1 decode, we only use row-group 0 (16 rows). Each column holds 4 FP32 values.