diff --git a/tests/unit/test_tmem_4warp_read.cu b/tests/unit/test_tmem_4warp_read.cu index 66afc58e..c624efd3 100644 --- a/tests/unit/test_tmem_4warp_read.cu +++ b/tests/unit/test_tmem_4warp_read.cu @@ -68,9 +68,9 @@ test_16x256b_loads(float* results) { // Read column 0 — lane 0 should get rows 0-3, lane 1 should get rows 4-7, etc. { float v0, v1, v2, v3; - asm volatile("tcgen05.ld.sync.aligned.16x256b.x1.b32 {%0, %1, %2, %3}, [%4, %5];" + asm volatile("tcgen05.ld.sync.aligned.16x256b.x1.b32 {%0, %1, %2, %3}, [%4];" : "=f"(v0), "=f"(v1), "=f"(v2), "=f"(v3) - : "r"(tb), "r"(0)); // column 0 + : "r"(tb + 0)); // column 0 asm volatile("tcgen05.wait::ld.sync.aligned;"); load_count++; @@ -96,9 +96,9 @@ test_16x256b_loads(float* results) { // Read column 1 (2nd 16x256b.x1 load — does it crash?) { float v0, v1, v2, v3; - asm volatile("tcgen05.ld.sync.aligned.16x256b.x1.b32 {%0, %1, %2, %3}, [%4, %5];" + asm volatile("tcgen05.ld.sync.aligned.16x256b.x1.b32 {%0, %1, %2, %3}, [%4];" : "=f"(v0), "=f"(v1), "=f"(v2), "=f"(v3) - : "r"(tb), "r"(1)); // column 1 + : "r"(tb + 1)); // column 1 asm volatile("tcgen05.wait::ld.sync.aligned;"); load_count++; @@ -113,9 +113,9 @@ test_16x256b_loads(float* results) { // Read column 8 (8th column — more 16x256b.x1 loads) { float v0, v1, v2, v3; - asm volatile("tcgen05.ld.sync.aligned.16x256b.x1.b32 {%0, %1, %2, %3}, [%4, %5];" + asm volatile("tcgen05.ld.sync.aligned.16x256b.x1.b32 {%0, %1, %2, %3}, [%4];" : "=f"(v0), "=f"(v1), "=f"(v2), "=f"(v3) - : "r"(tb), "r"(8)); + : "r"(tb + 8)); asm volatile("tcgen05.wait::ld.sync.aligned;"); load_count++;