diff --git a/tests/unit/test_tmem_4warp_read.cu b/tests/unit/test_tmem_4warp_read.cu
index 66afc58e..c624efd3 100644
--- a/tests/unit/test_tmem_4warp_read.cu
+++ b/tests/unit/test_tmem_4warp_read.cu
@@ -68,9 +68,9 @@ test_16x256b_loads(float* results) {
     // Read column 0 — lane 0 should get rows 0-3, lane 1 should get rows 4-7, etc.
     {
         float v0, v1, v2, v3;
-        asm volatile("tcgen05.ld.sync.aligned.16x256b.x1.b32 {%0, %1, %2, %3}, [%4, %5];"
+        asm volatile("tcgen05.ld.sync.aligned.16x256b.x1.b32 {%0, %1, %2, %3}, [%4];"
             : "=f"(v0), "=f"(v1), "=f"(v2), "=f"(v3)
-            : "r"(tb), "r"(0));  // column 0
+            : "r"(tb + 0));  // column 0
         asm volatile("tcgen05.wait::ld.sync.aligned;");
         load_count++;
 
@@ -96,9 +96,9 @@ test_16x256b_loads(float* results) {
     // Read column 1 (2nd 16x256b.x1 load — does it crash?)
     {
         float v0, v1, v2, v3;
-        asm volatile("tcgen05.ld.sync.aligned.16x256b.x1.b32 {%0, %1, %2, %3}, [%4, %5];"
+        asm volatile("tcgen05.ld.sync.aligned.16x256b.x1.b32 {%0, %1, %2, %3}, [%4];"
             : "=f"(v0), "=f"(v1), "=f"(v2), "=f"(v3)
-            : "r"(tb), "r"(1));  // column 1
+            : "r"(tb + 1));  // column 1
         asm volatile("tcgen05.wait::ld.sync.aligned;");
         load_count++;
 
@@ -113,9 +113,9 @@ test_16x256b_loads(float* results) {
     // Read column 8 (8th column — more 16x256b.x1 loads)
     {
         float v0, v1, v2, v3;
-        asm volatile("tcgen05.ld.sync.aligned.16x256b.x1.b32 {%0, %1, %2, %3}, [%4, %5];"
+        asm volatile("tcgen05.ld.sync.aligned.16x256b.x1.b32 {%0, %1, %2, %3}, [%4];"
             : "=f"(v0), "=f"(v1), "=f"(v2), "=f"(v3)
-            : "r"(tb), "r"(8));
+            : "r"(tb + 8));
         asm volatile("tcgen05.wait::ld.sync.aligned;");
         load_count++;