From 680d2ebf644aaf318d8ffaa4d3c2a87d35689827 Mon Sep 17 00:00:00 2001 From: biondizzle Date: Fri, 29 May 2026 22:42:46 +0000 Subject: [PATCH] =?UTF-8?q?test:=20V=20TMA=20diagnostic=20=E2=80=94=20isol?= =?UTF-8?q?ate=20V=20TMA=20descriptor=20issue?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/unit/test_v_tma.cu | 194 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 194 insertions(+) create mode 100644 tests/unit/test_v_tma.cu diff --git a/tests/unit/test_v_tma.cu b/tests/unit/test_v_tma.cu new file mode 100644 index 00000000..cd552ce1 --- /dev/null +++ b/tests/unit/test_v_tma.cu @@ -0,0 +1,194 @@ +/** + * Debug V TMA loads. Load a (16,16) tile from V=(HD,SK) via TMA, + * compare against direct GMEM read. + */ + +#include +#include +#include +#include +#include +#include + +#ifndef HD_VAL +#define HD_VAL 64 +#endif + +typedef unsigned short bf16_t; +static bf16_t f32_to_bf16_host(float f) { uint32_t u; memcpy(&u,&f,4); return (uint16_t)(u>>16); } +static float bf16_to_f32_host(bf16_t h) { uint32_t u=(uint32_t)h<<16; float f; memcpy(&f,&u,4); return f; } + +constexpr int HD = HD_VAL; +constexpr int SK = 128; + +__device__ __forceinline__ void tma_mbarrier_init(uint32_t smem_mbar, uint32_t count) { + asm volatile("mbarrier.init.shared::cta.b64 [%0], %1;" :: "r"(smem_mbar), "r"(count)); +} +__device__ __forceinline__ void tma_mbarrier_arrive_expect_tx(uint32_t smem_mbar, uint32_t tx_bytes) { + asm volatile("mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 _, [%0], %1;" + :: "r"(smem_mbar), "r"(tx_bytes) : "memory"); +} +__device__ __forceinline__ void tma_load_2d(uint32_t dst, uint64_t desc, uint32_t mbar, int cx, int cy) { + asm volatile("cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes " + "[%0], [%1, {%3, %4}], [%2];" :: "r"(dst), "l"(desc), "r"(mbar), "r"(cx), "r"(cy) : "memory"); +} +__device__ __forceinline__ void tma_mbarrier_wait(uint32_t smem_mbar, int phase) { + asm volatile("{\n\t.reg .pred P1;\n\tLAB_WAIT:mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 P1, [%0], %1, %2;\n\t@P1 bra.uni DONE;\n\tbra.uni LAB_WAIT;\n\tDONE:\n\t}" :: "r"(smem_mbar), "r"(phase), "r"(0x989680) : "memory"); +} + +// Test: load V tile at (d_base, col_start) via TMA, compare with direct read +__global__ void __launch_bounds__(32) +test_v_tma_kernel( + bf16_t* __restrict__ verify_buf, + CUtensorMap* __restrict__ tma_v, + int d_base, int col_start +) { + const int tid = threadIdx.x; + const int lane = tid % 32; + + extern __shared__ __align__(128) char sbuf[]; + bf16_t* sTmaBuf = (bf16_t*)(sbuf); + uint64_t* sMbar = (uint64_t*)(sbuf + 512); + + if (tid == 0) { + tma_mbarrier_init((uint32_t)__cvta_generic_to_shared(sMbar), 1); + asm volatile("fence.mbarrier_init.release.cluster;" ::: "memory"); + } + __syncthreads(); + + uint32_t mbar_addr = (uint32_t)__cvta_generic_to_shared(sMbar); + if (lane == 0) { + tma_load_2d((uint32_t)__cvta_generic_to_shared(sTmaBuf), (uint64_t)tma_v, mbar_addr, col_start, d_base); + tma_mbarrier_arrive_expect_tx(mbar_addr, 16 * 16 * 2); // (16,16) BF16 = 512 bytes + } + tma_mbarrier_wait(mbar_addr, 0); + __syncthreads(); + + // Copy to verify buffer + for (int i = lane; i < 256; i += 32) verify_buf[i] = sTmaBuf[i]; +} + +inline bool create_tma_desc_2d_bf16( + CUtensorMap* out, const void* ptr, + uint64_t rows, uint64_t cols, + uint32_t tile_rows, uint32_t tile_cols +) { + uint64_t gd[] = {cols, rows}, gs[] = {cols * 2}; + uint32_t td[] = {tile_cols, tile_rows}, ts[] = {1, 1}; + CUresult r = cuTensorMapEncodeTiled(out, CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, 2, + const_cast(ptr), gd, gs, td, ts, + CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE, + CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE); + if (r != CUDA_SUCCESS) { fprintf(stderr, "TMA fail: %d\n", (int)r); return false; } + int dv=0; cudaDriverGetVersion(&dv); + if (dv <= 13010 && rows*cols*2 < 131072) reinterpret_cast(out)[1] &= ~(1ULL<<21); + return true; +} + +int main() { + printf("=== V TMA debug (HD=%d, SK=%d) ===\n", HD, SK); + + // V = (HD, SK) in GMEM, row-major + bf16_t* h_v = (bf16_t*)calloc(HD * SK, sizeof(bf16_t)); + srand(42); + for (int i = 0; i < HD * SK; i++) h_v[i] = f32_to_bf16_host((float)(rand()%100)/100.0f - 0.5f); + + bf16_t *d_v, *d_verify; + cudaMalloc(&d_v, HD * SK * sizeof(bf16_t)); + cudaMalloc(&d_verify, 256 * sizeof(bf16_t)); + cudaMemcpy(d_v, h_v, HD * SK * sizeof(bf16_t), cudaMemcpyHostToDevice); + + // TMA descriptor for V: (HD, SK) with tile (16, 16) + CUtensorMap tma_v; CUtensorMap* d_tma_v; + bool ok = create_tma_desc_2d_bf16(&tma_v, d_v, HD, SK, 16, 16); + printf("TMA desc creation: %s\n", ok ? "OK" : "FAILED"); + if (!ok) return 1; + cudaMalloc(&d_tma_v, sizeof(CUtensorMap)); + cudaMemcpy(d_tma_v, &tma_v, sizeof(CUtensorMap), cudaMemcpyHostToDevice); + + // Test: load V tile at (d_base=0, col_start=0) — V[0:16, 0:16] + printf("\n--- Test 1: d_base=0, col_start=0 ---\n"); + { + cudaMemset(d_verify, 0, 256 * sizeof(bf16_t)); + size_t smem = 512 + 128 + 16; + test_v_tma_kernel<<<1, 32, smem>>>(d_verify, d_tma_v, 0, 0); + cudaError_t err = cudaDeviceSynchronize(); + if (err != cudaSuccess) { printf("CUDA ERROR: %s\n", cudaGetErrorString(err)); return 1; } + + bf16_t* h_verify = (bf16_t*)malloc(256 * sizeof(bf16_t)); + cudaMemcpy(h_verify, d_verify, 256 * sizeof(bf16_t), cudaMemcpyDeviceToHost); + + // V[0:16, 0:16] should match + int bad = 0; + for (int r = 0; r < 16; r++) { + for (int c = 0; c < 16; c++) { + bf16_t expected = h_v[r * SK + c]; // V is (HD, SK), row r has stride SK + bf16_t got = h_verify[r * 16 + c]; // TMA output is (16,16) row-major + if (got != expected) { + if (bad < 5) printf(" [%d,%d]: expected %u got %u\n", r, c, (unsigned)expected, (unsigned)got); + bad++; + } + } + } + printf(" Mismatches: %d / 256\n", bad); + free(h_verify); + } + + // Test 2: d_base=16, col_start=16 — V[16:32, 16:32] + printf("\n--- Test 2: d_base=16, col_start=16 ---\n"); + { + cudaMemset(d_verify, 0, 256 * sizeof(bf16_t)); + size_t smem = 512 + 128 + 16; + test_v_tma_kernel<<<1, 32, smem>>>(d_verify, d_tma_v, 16, 16); + cudaError_t err = cudaDeviceSynchronize(); + if (err != cudaSuccess) { printf("CUDA ERROR: %s\n", cudaGetErrorString(err)); return 1; } + + bf16_t* h_verify = (bf16_t*)malloc(256 * sizeof(bf16_t)); + cudaMemcpy(h_verify, d_verify, 256 * sizeof(bf16_t), cudaMemcpyDeviceToHost); + + int bad = 0; + for (int r = 0; r < 16; r++) { + for (int c = 0; c < 16; c++) { + bf16_t expected = h_v[(16+r) * SK + (16+c)]; + bf16_t got = h_verify[r * 16 + c]; + if (got != expected) { + if (bad < 5) printf(" [%d,%d]: expected %u got %u\n", r, c, (unsigned)expected, (unsigned)got); + bad++; + } + } + } + printf(" Mismatches: %d / 256\n", bad); + free(h_verify); + } + + // Test 3: d_base=48, col_start=96 — V[48:64, 96:112] (near edge for HD=64) + printf("\n--- Test 3: d_base=48, col_start=96 ---\n"); + { + cudaMemset(d_verify, 0, 256 * sizeof(bf16_t)); + size_t smem = 512 + 128 + 16; + test_v_tma_kernel<<<1, 32, smem>>>(d_verify, d_tma_v, 48, 96); + cudaError_t err = cudaDeviceSynchronize(); + if (err != cudaSuccess) { printf("CUDA ERROR: %s\n", cudaGetErrorString(err)); return 1; } + + bf16_t* h_verify = (bf16_t*)malloc(256 * sizeof(bf16_t)); + cudaMemcpy(h_verify, d_verify, 256 * sizeof(bf16_t), cudaMemcpyDeviceToHost); + + int bad = 0; + for (int r = 0; r < 16; r++) { + for (int c = 0; c < 16; c++) { + bf16_t expected = h_v[(48+r) * SK + (96+c)]; + bf16_t got = h_verify[r * 16 + c]; + if (got != expected) { + if (bad < 5) printf(" [%d,%d]: expected %u got %u\n", r, c, (unsigned)expected, (unsigned)got); + bad++; + } + } + } + printf(" Mismatches: %d / 256\n", bad); + free(h_verify); + } + + cudaFree(d_v); cudaFree(d_verify); cudaFree(d_tma_v); + free(h_v); + return 0; +}