fix: use cvta.shared.u32 instead of cvta.to.shared.u32

This commit is contained in:
2026-05-28 06:36:50 +00:00
parent 771799e112
commit cf264bd0e2

View File

@@ -30,7 +30,7 @@ fmha_decode_tmem(
float* sRowSums = (float*)(sbuf + HD*sizeof(float));
// Use remaining SMEM for TMEM allocation (tcgen05.alloc maps it)
uint32_t tmem_smem_ptr = 0;
asm("cvta.to.shared.u32 %0, %1;" : "=r"(tmem_smem_ptr) : "l"(sbuf));
asm("cvta.shared.u32 %0, %1;" : "=r"(tmem_smem_ptr) : "l"(sbuf));
// TMEM column count: each tcgen05.ld reads 4 FP32 per column (16 rows × 256 bits)
// For T=1 decode, we only use row-group 0 (16 rows). Each column holds 4 FP32 values.