195 lines
7.8 KiB
Plaintext
195 lines
7.8 KiB
Plaintext
/**
|
|
* Debug V TMA loads. Load a (16,16) tile from V=(HD,SK) via TMA,
|
|
* compare against direct GMEM read.
|
|
*/
|
|
|
|
#include <cuda_runtime.h>
|
|
#include <cuda.h>
|
|
#include <cstdio>
|
|
#include <cmath>
|
|
#include <cstdlib>
|
|
#include <cstring>
|
|
|
|
#ifndef HD_VAL
|
|
#define HD_VAL 64
|
|
#endif
|
|
|
|
typedef unsigned short bf16_t;
|
|
static bf16_t f32_to_bf16_host(float f) { uint32_t u; memcpy(&u,&f,4); return (uint16_t)(u>>16); }
|
|
static float bf16_to_f32_host(bf16_t h) { uint32_t u=(uint32_t)h<<16; float f; memcpy(&f,&u,4); return f; }
|
|
|
|
constexpr int HD = HD_VAL;
|
|
constexpr int SK = 128;
|
|
|
|
__device__ __forceinline__ void tma_mbarrier_init(uint32_t smem_mbar, uint32_t count) {
|
|
asm volatile("mbarrier.init.shared::cta.b64 [%0], %1;" :: "r"(smem_mbar), "r"(count));
|
|
}
|
|
__device__ __forceinline__ void tma_mbarrier_arrive_expect_tx(uint32_t smem_mbar, uint32_t tx_bytes) {
|
|
asm volatile("mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 _, [%0], %1;"
|
|
:: "r"(smem_mbar), "r"(tx_bytes) : "memory");
|
|
}
|
|
__device__ __forceinline__ void tma_load_2d(uint32_t dst, uint64_t desc, uint32_t mbar, int cx, int cy) {
|
|
asm volatile("cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes "
|
|
"[%0], [%1, {%3, %4}], [%2];" :: "r"(dst), "l"(desc), "r"(mbar), "r"(cx), "r"(cy) : "memory");
|
|
}
|
|
__device__ __forceinline__ void tma_mbarrier_wait(uint32_t smem_mbar, int phase) {
|
|
asm volatile("{\n\t.reg .pred P1;\n\tLAB_WAIT:mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 P1, [%0], %1, %2;\n\t@P1 bra.uni DONE;\n\tbra.uni LAB_WAIT;\n\tDONE:\n\t}" :: "r"(smem_mbar), "r"(phase), "r"(0x989680) : "memory");
|
|
}
|
|
|
|
// Test: load V tile at (d_base, col_start) via TMA, compare with direct read
|
|
__global__ void __launch_bounds__(32)
|
|
test_v_tma_kernel(
|
|
bf16_t* __restrict__ verify_buf,
|
|
CUtensorMap* __restrict__ tma_v,
|
|
int d_base, int col_start
|
|
) {
|
|
const int tid = threadIdx.x;
|
|
const int lane = tid % 32;
|
|
|
|
extern __shared__ __align__(128) char sbuf[];
|
|
bf16_t* sTmaBuf = (bf16_t*)(sbuf);
|
|
uint64_t* sMbar = (uint64_t*)(sbuf + 512);
|
|
|
|
if (tid == 0) {
|
|
tma_mbarrier_init((uint32_t)__cvta_generic_to_shared(sMbar), 1);
|
|
asm volatile("fence.mbarrier_init.release.cluster;" ::: "memory");
|
|
}
|
|
__syncthreads();
|
|
|
|
uint32_t mbar_addr = (uint32_t)__cvta_generic_to_shared(sMbar);
|
|
if (lane == 0) {
|
|
tma_load_2d((uint32_t)__cvta_generic_to_shared(sTmaBuf), (uint64_t)tma_v, mbar_addr, col_start, d_base);
|
|
tma_mbarrier_arrive_expect_tx(mbar_addr, 16 * 16 * 2); // (16,16) BF16 = 512 bytes
|
|
}
|
|
tma_mbarrier_wait(mbar_addr, 0);
|
|
__syncthreads();
|
|
|
|
// Copy to verify buffer
|
|
for (int i = lane; i < 256; i += 32) verify_buf[i] = sTmaBuf[i];
|
|
}
|
|
|
|
inline bool create_tma_desc_2d_bf16(
|
|
CUtensorMap* out, const void* ptr,
|
|
uint64_t rows, uint64_t cols,
|
|
uint32_t tile_rows, uint32_t tile_cols
|
|
) {
|
|
uint64_t gd[] = {cols, rows}, gs[] = {cols * 2};
|
|
uint32_t td[] = {tile_cols, tile_rows}, ts[] = {1, 1};
|
|
CUresult r = cuTensorMapEncodeTiled(out, CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, 2,
|
|
const_cast<void*>(ptr), gd, gs, td, ts,
|
|
CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
|
|
CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
|
|
if (r != CUDA_SUCCESS) { fprintf(stderr, "TMA fail: %d\n", (int)r); return false; }
|
|
int dv=0; cudaDriverGetVersion(&dv);
|
|
if (dv <= 13010 && rows*cols*2 < 131072) reinterpret_cast<uint64_t*>(out)[1] &= ~(1ULL<<21);
|
|
return true;
|
|
}
|
|
|
|
int main() {
|
|
printf("=== V TMA debug (HD=%d, SK=%d) ===\n", HD, SK);
|
|
|
|
// V = (HD, SK) in GMEM, row-major
|
|
bf16_t* h_v = (bf16_t*)calloc(HD * SK, sizeof(bf16_t));
|
|
srand(42);
|
|
for (int i = 0; i < HD * SK; i++) h_v[i] = f32_to_bf16_host((float)(rand()%100)/100.0f - 0.5f);
|
|
|
|
bf16_t *d_v, *d_verify;
|
|
cudaMalloc(&d_v, HD * SK * sizeof(bf16_t));
|
|
cudaMalloc(&d_verify, 256 * sizeof(bf16_t));
|
|
cudaMemcpy(d_v, h_v, HD * SK * sizeof(bf16_t), cudaMemcpyHostToDevice);
|
|
|
|
// TMA descriptor for V: (HD, SK) with tile (16, 16)
|
|
CUtensorMap tma_v; CUtensorMap* d_tma_v;
|
|
bool ok = create_tma_desc_2d_bf16(&tma_v, d_v, HD, SK, 16, 16);
|
|
printf("TMA desc creation: %s\n", ok ? "OK" : "FAILED");
|
|
if (!ok) return 1;
|
|
cudaMalloc(&d_tma_v, sizeof(CUtensorMap));
|
|
cudaMemcpy(d_tma_v, &tma_v, sizeof(CUtensorMap), cudaMemcpyHostToDevice);
|
|
|
|
// Test: load V tile at (d_base=0, col_start=0) — V[0:16, 0:16]
|
|
printf("\n--- Test 1: d_base=0, col_start=0 ---\n");
|
|
{
|
|
cudaMemset(d_verify, 0, 256 * sizeof(bf16_t));
|
|
size_t smem = 512 + 128 + 16;
|
|
test_v_tma_kernel<<<1, 32, smem>>>(d_verify, d_tma_v, 0, 0);
|
|
cudaError_t err = cudaDeviceSynchronize();
|
|
if (err != cudaSuccess) { printf("CUDA ERROR: %s\n", cudaGetErrorString(err)); return 1; }
|
|
|
|
bf16_t* h_verify = (bf16_t*)malloc(256 * sizeof(bf16_t));
|
|
cudaMemcpy(h_verify, d_verify, 256 * sizeof(bf16_t), cudaMemcpyDeviceToHost);
|
|
|
|
// V[0:16, 0:16] should match
|
|
int bad = 0;
|
|
for (int r = 0; r < 16; r++) {
|
|
for (int c = 0; c < 16; c++) {
|
|
bf16_t expected = h_v[r * SK + c]; // V is (HD, SK), row r has stride SK
|
|
bf16_t got = h_verify[r * 16 + c]; // TMA output is (16,16) row-major
|
|
if (got != expected) {
|
|
if (bad < 5) printf(" [%d,%d]: expected %u got %u\n", r, c, (unsigned)expected, (unsigned)got);
|
|
bad++;
|
|
}
|
|
}
|
|
}
|
|
printf(" Mismatches: %d / 256\n", bad);
|
|
free(h_verify);
|
|
}
|
|
|
|
// Test 2: d_base=16, col_start=16 — V[16:32, 16:32]
|
|
printf("\n--- Test 2: d_base=16, col_start=16 ---\n");
|
|
{
|
|
cudaMemset(d_verify, 0, 256 * sizeof(bf16_t));
|
|
size_t smem = 512 + 128 + 16;
|
|
test_v_tma_kernel<<<1, 32, smem>>>(d_verify, d_tma_v, 16, 16);
|
|
cudaError_t err = cudaDeviceSynchronize();
|
|
if (err != cudaSuccess) { printf("CUDA ERROR: %s\n", cudaGetErrorString(err)); return 1; }
|
|
|
|
bf16_t* h_verify = (bf16_t*)malloc(256 * sizeof(bf16_t));
|
|
cudaMemcpy(h_verify, d_verify, 256 * sizeof(bf16_t), cudaMemcpyDeviceToHost);
|
|
|
|
int bad = 0;
|
|
for (int r = 0; r < 16; r++) {
|
|
for (int c = 0; c < 16; c++) {
|
|
bf16_t expected = h_v[(16+r) * SK + (16+c)];
|
|
bf16_t got = h_verify[r * 16 + c];
|
|
if (got != expected) {
|
|
if (bad < 5) printf(" [%d,%d]: expected %u got %u\n", r, c, (unsigned)expected, (unsigned)got);
|
|
bad++;
|
|
}
|
|
}
|
|
}
|
|
printf(" Mismatches: %d / 256\n", bad);
|
|
free(h_verify);
|
|
}
|
|
|
|
// Test 3: d_base=48, col_start=96 — V[48:64, 96:112] (near edge for HD=64)
|
|
printf("\n--- Test 3: d_base=48, col_start=96 ---\n");
|
|
{
|
|
cudaMemset(d_verify, 0, 256 * sizeof(bf16_t));
|
|
size_t smem = 512 + 128 + 16;
|
|
test_v_tma_kernel<<<1, 32, smem>>>(d_verify, d_tma_v, 48, 96);
|
|
cudaError_t err = cudaDeviceSynchronize();
|
|
if (err != cudaSuccess) { printf("CUDA ERROR: %s\n", cudaGetErrorString(err)); return 1; }
|
|
|
|
bf16_t* h_verify = (bf16_t*)malloc(256 * sizeof(bf16_t));
|
|
cudaMemcpy(h_verify, d_verify, 256 * sizeof(bf16_t), cudaMemcpyDeviceToHost);
|
|
|
|
int bad = 0;
|
|
for (int r = 0; r < 16; r++) {
|
|
for (int c = 0; c < 16; c++) {
|
|
bf16_t expected = h_v[(48+r) * SK + (96+c)];
|
|
bf16_t got = h_verify[r * 16 + c];
|
|
if (got != expected) {
|
|
if (bad < 5) printf(" [%d,%d]: expected %u got %u\n", r, c, (unsigned)expected, (unsigned)got);
|
|
bad++;
|
|
}
|
|
}
|
|
}
|
|
printf(" Mismatches: %d / 256\n", bad);
|
|
free(h_verify);
|
|
}
|
|
|
|
cudaFree(d_v); cudaFree(d_verify); cudaFree(d_tma_v);
|
|
free(h_v);
|
|
return 0;
|
|
}
|