fix: use __cvta_generic_to_shared directly for 64-bit compat

This commit is contained in:
2026-05-28 22:56:29 +00:00
parent 1d6a95df32
commit 28e04a5ea8

View File

@@ -38,8 +38,7 @@ __global__ void test_tmem_row_offset(float* results) {
uint32_t* sTmemBase = (uint32_t*)sbuf;
// Alloc TMEM
-uint32_t smem_ptr;
-asm volatile("cvta.to.shared.u32 %0, %1;" : "=r"(smem_ptr) : "r"((uint32_t)__cvta_generic_to_shared(sTmemBase)));
+uint32_t smem_ptr = __cvta_generic_to_shared(sTmemBase);
if (lane == 0) tmem_alloc(smem_ptr, TMEM_N);
__syncwarp();
uint32_t tb = *sTmemBase;