From 95003eced216d9bef73f5cd3fd9bc47572d914bc Mon Sep 17 00:00:00 2001 From: biondizzle Date: Thu, 28 May 2026 23:03:10 +0000 Subject: [PATCH] test: 16x256b.x1 loads with uint32_t regs, matching working pattern --- tests/unit/test_tmem_4warp_read.cu | 118 ++++++++++------------------- 1 file changed, 41 insertions(+), 77 deletions(-) diff --git a/tests/unit/test_tmem_4warp_read.cu b/tests/unit/test_tmem_4warp_read.cu index c624efd3..6b4333c5 100644 --- a/tests/unit/test_tmem_4warp_read.cu +++ b/tests/unit/test_tmem_4warp_read.cu @@ -1,10 +1,6 @@ /** * Test: Can we do 16x256b.x1 LOADS multiple times without crashing? - * (The crash was on 16x256b.x1 STORES, not loads.) - * - * If loads work multiple times, we can: - * - Use 16x256b.x1 for softmax reads (128 rows, 1 column per call) - * - Use 32x32b.x8 for everything else (stores, PV, epilogue) + * Uses uint32_t registers (not float) per the working test_tmem_minimal.cu pattern. */ #include @@ -40,96 +36,70 @@ test_16x256b_loads(float* results) { __syncwarp(); uint32_t tb = *sTmemBase; - // Write data via 32x32b.x8 (known working for stores) + // Write data via 32x32b.x8 (known working) { float vals[8]; for (int c = 0; c < 8; c++) vals[c] = (float)(lane * 10 + c); uint32_t ivals[8]; for (int c = 0; c < 8; c++) memcpy(&ivals[c], &vals[c], 4); - // Write to column groups 0-3 (32 columns) - for (int n = 0; n < 4; n++) { - asm volatile("tcgen05.st.sync.aligned.32x32b.x8.b32 [%0], {%1,%2,%3,%4,%5,%6,%7,%8};" - :: "r"(tb + n * 8), - "r"(ivals[0]), "r"(ivals[1]), "r"(ivals[2]), "r"(ivals[3]), - "r"(ivals[4]), "r"(ivals[5]), "r"(ivals[6]), "r"(ivals[7])); - asm volatile("tcgen05.wait::st.sync.aligned;" ::: "memory"); - // Update vals for next column group - for (int c = 0; c < 8; c++) vals[c] = (float)(lane * 10 + n * 8 + c); - for (int c = 0; c < 8; c++) memcpy(&ivals[c], &vals[c], 4); - } + asm volatile("tcgen05.st.sync.aligned.32x32b.x8.b32 [%0], {%1,%2,%3,%4,%5,%6,%7,%8};" + :: "r"(tb + 0), + "r"(ivals[0]), "r"(ivals[1]), "r"(ivals[2]), "r"(ivals[3]), + "r"(ivals[4]), "r"(ivals[5]), "r"(ivals[6]), "r"(ivals[7])); + asm volatile("tcgen05.wait::st.sync.aligned;" ::: "memory"); } - // Now try reading with 16x256b.x1 loads - // 16x256b.x1: lane l reads 4 FP32 values (rows l*4+0..3) from 1 column + // 16x256b.x1 loads using uint32_t registers (matching test_tmem_minimal.cu) int load_count = 0; - int pass = 1; - // Read column 0 — lane 0 should get rows 0-3, lane 1 should get rows 4-7, etc. + // Load column 0 { - float v0, v1, v2, v3; + uint32_t u0, u1, u2, u3; asm volatile("tcgen05.ld.sync.aligned.16x256b.x1.b32 {%0, %1, %2, %3}, [%4];" - : "=f"(v0), "=f"(v1), "=f"(v2), "=f"(v3) - : "r"(tb + 0)); // column 0 + : "=r"(u0), "=r"(u1), "=r"(u2), "=r"(u3) : "r"(tb + 0)); asm volatile("tcgen05.wait::ld.sync.aligned;"); load_count++; - // Lane l should see values for rows l*4+0..3 - // From the store: row 0 col 0 = 0.0, row 1 col 0 = 10.0, row 2 col 0 = 20.0, ... - // Wait — the 32x32b.x8 store wrote lane l's data to "row l" in TMEM. - // So row l, col 0 = l*10 + 0 = l*10 - // Lane 0 reads rows 0-3: v0=row0=0, v1=row1=10, v2=row2=20, v3=row3=30 - if (lane == 0) { - results[0] = v0; - results[1] = v1; - results[2] = v2; - results[3] = v3; - } - if (lane == 1) { - results[4] = v0; - results[5] = v1; - results[6] = v2; - results[7] = v3; - } + float v0, v1, v2, v3; + memcpy(&v0, &u0, 4); memcpy(&v1, &u1, 4); + memcpy(&v2, &u2, 4); memcpy(&v3, &u3, 4); + + if (lane == 0) { results[0]=v0; results[1]=v1; results[2]=v2; results[3]=v3; } + if (lane == 1) { results[4]=v0; results[5]=v1; results[6]=v2; results[7]=v3; } } - // Read column 1 (2nd 16x256b.x1 load — does it crash?) + // Load column 1 (2nd load — does it crash?) { - float v0, v1, v2, v3; + uint32_t u0, u1, u2, u3; asm volatile("tcgen05.ld.sync.aligned.16x256b.x1.b32 {%0, %1, %2, %3}, [%4];" - : "=f"(v0), "=f"(v1), "=f"(v2), "=f"(v3) - : "r"(tb + 1)); // column 1 + : "=r"(u0), "=r"(u1), "=r"(u2), "=r"(u3) : "r"(tb + 1)); asm volatile("tcgen05.wait::ld.sync.aligned;"); load_count++; - if (lane == 0) { - results[8] = v0; - results[9] = v1; - results[10] = v2; - results[11] = v3; - } + float v0, v1, v2, v3; + memcpy(&v0, &u0, 4); memcpy(&v1, &u1, 4); + memcpy(&v2, &u2, 4); memcpy(&v3, &u3, 4); + + if (lane == 0) { results[8]=v0; results[9]=v1; results[10]=v2; results[11]=v3; } } - // Read column 8 (8th column — more 16x256b.x1 loads) + // Load column 8 (3rd load) { - float v0, v1, v2, v3; + uint32_t u0, u1, u2, u3; asm volatile("tcgen05.ld.sync.aligned.16x256b.x1.b32 {%0, %1, %2, %3}, [%4];" - : "=f"(v0), "=f"(v1), "=f"(v2), "=f"(v3) - : "r"(tb + 8)); + : "=r"(u0), "=r"(u1), "=r"(u2), "=r"(u3) : "r"(tb + 8)); asm volatile("tcgen05.wait::ld.sync.aligned;"); load_count++; - if (lane == 0) { - results[12] = v0; // row 0, col 8 = 0*10+8 = 8.0 - results[13] = v1; // row 1, col 8 = 10+8 = 18.0? No, 1*10+0=10 + 8 = 18 - results[14] = v2; - results[15] = v3; - } + float v0, v1, v2, v3; + memcpy(&v0, &u0, 4); memcpy(&v1, &u1, 4); + memcpy(&v2, &u2, 4); memcpy(&v3, &u3, 4); + + if (lane == 0) { results[12]=v0; results[13]=v1; results[14]=v2; results[15]=v3; } } - // Store load count (if we get here, loads didn't crash) if (lane == 0) results[16] = (float)load_count; - tmem_dealloc(tb, TMEM_N); } @@ -145,7 +115,7 @@ int main() { cudaError_t err = cudaDeviceSynchronize(); if (err != cudaSuccess) { printf("CUDA ERROR: %s\n", cudaGetErrorString(err)); - printf("16x256b.x1 loads CRASHED (likely on 2nd or 3rd call)\n"); + printf("16x256b.x1 loads CRASHED\n"); cudaFree(d_r); return 1; } @@ -155,20 +125,14 @@ int main() { printf("Load count: %d (3 loads completed = no crash)\n\n", (int)h[16]); - printf("Column 0, lane 0 (expect rows 0-3 = 0,10,20,30): %.1f %.1f %.1f %.1f\n", - h[0], h[1], h[2], h[3]); - printf("Column 0, lane 1 (expect rows 4-7 = 40,50,60,70): %.1f %.1f %.1f %.1f\n", - h[4], h[5], h[6], h[7]); - printf("Column 1, lane 0 (expect rows 0-3 = 1,11,21,31): %.1f %.1f %.1f %.1f\n", - h[8], h[9], h[10], h[11]); - printf("Column 8, lane 0 (expect row 0 col 8 = 8, row 1 col 8 = 18, etc): %.1f %.1f %.1f %.1f\n", - h[12], h[13], h[14], h[15]); + printf("Col 0, lane 0 (expect rows 0-3: 0,10,20,30): %.1f %.1f %.1f %.1f\n", h[0], h[1], h[2], h[3]); + printf("Col 0, lane 1 (expect rows 4-7: 40,50,60,70): %.1f %.1f %.1f %.1f\n", h[4], h[5], h[6], h[7]); + printf("Col 1, lane 0 (expect rows 0-3 at col 1: 1,11,21,31): %.1f %.1f %.1f %.1f\n", h[8], h[9], h[10], h[11]); + printf("Col 8, lane 0 (expect row 0 col 8 = 8, row 1 col 8 = 18, etc): %.1f %.1f %.1f %.1f\n", h[12], h[13], h[14], h[15]); - // Verify: column 0, lane 0 should give [0, 10, 20, 30] - int pass = (fabsf(h[0] - 0.0f) < 0.01f) && (fabsf(h[1] - 10.0f) < 0.01f) && - (fabsf(h[2] - 20.0f) < 0.01f) && (fabsf(h[3] - 30.0f) < 0.01f); - - printf("\nResult: %s\n", pass ? "16x256b.x1 LOADS work multiple times!" : "Data mismatch"); + int pass = (fabsf(h[0]-0.0f)<0.01f) && (fabsf(h[1]-10.0f)<0.01f) && + (fabsf(h[2]-20.0f)<0.01f) && (fabsf(h[3]-30.0f)<0.01f) && (int)h[16]==3; + printf("\nResult: %s\n", pass ? "16x256b.x1 LOADS work multiple times!" : "Data mismatch or crash"); cudaFree(d_r); return pass ? 0 : 1; }