fix: use cvta.shared.u32 instead of cvta.to.shared.u32 when computing tmem_smem_ptr
This commit is contained in:
@@ -30,7 +30,7 @@ fmha_decode_tmem(
|
||||
float* sRowSums = (float*)(sbuf + HD*sizeof(float));
|
||||
// Use remaining SMEM for TMEM allocation (tcgen05.alloc maps it)
|
||||
uint32_t tmem_smem_ptr = 0;
|
||||
asm("cvta.to.shared.u32 %0, %1;" : "=r"(tmem_smem_ptr) : "l"(sbuf));
|
||||
asm("cvta.shared.u32 %0, %1;" : "=r"(tmem_smem_ptr) : "l"(sbuf));
|
||||
|
||||
// TMEM column count: each tcgen05.ld reads 4 FP32 per column (16 rows × 256 bits)
|
||||
// For T=1 decode, we only use row-group 0 (16 rows). Each column holds 4 FP32 values.
|
||||
|
||||
Reference in New Issue
Block a user