nvfp4-megamoe-kernel/tests/unit/test_tma_proper.cu

/**
 * Proper TMA load test using CUtensorMap for a (128, 16) BF16 tile.
 *
 * Step 1: Create CUtensorMap on host
 * Step 2: Pass to kernel, use cp.async.bulk.tensor.2d to load
 * Step 3: Verify the loaded data matches the original
 * Step 4: Use the loaded data with UMMA SW128 descriptor
 *
 * This proves the TMA + SW128 pipeline works for FMHA.
 */

#include <cuda_runtime.h>
#include <cuda.h>
#include <cstdio>
#include <cstring>
#include <cmath>

typedef unsigned short bf16_t;

static bf16_t f32_to_bf16_host(float f) { uint32_t u; memcpy(&u,&f,4); return (uint16_t)(u>>16); }
static float bf16_to_f32_host(bf16_t h) { uint32_t u=(uint32_t)h<<16; float f; memcpy(&f,&u,4); return f; }

constexpr int ROWS = 128, COLS = 16;
constexpr int TILE_ROWS = 128, TILE_COLS = 16;  // TMA tile dimensions

// Kernel: load one tile via TMA, copy to output for verification
__global__ void __launch_bounds__(32)
test_tma_load_kernel(CUtensorMap* tma_desc, bf16_t* gmem_dst) {
    extern __shared__ char sbuf[];
    uint64_t* sMbar = (uint64_t*)sbuf;
    bf16_t* sData = (bf16_t*)(((uintptr_t)(sbuf + 8) + 127) & ~(uintptr_t)127);  // 128B aligned

    // Init mbarrier (1 thread)
    if (threadIdx.x == 0) {
        asm volatile("mbarrier.init.shared.b64 [%0], %1;"
            :: "r"((uint32_t)__cvta_generic_to_shared(sMbar)), "r"(1));
    }
    __syncthreads();

    // Issue TMA load (1 thread)
    if (threadIdx.x == 0) {
        // cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes
        // [smem_dst], [tma_desc, {coord_x, coord_y}], [mbarrier]
        uint32_t smem_addr = (uint32_t)__cvta_generic_to_shared(sData);
        uint32_t mbar_addr = (uint32_t)__cvta_generic_to_shared(sMbar);
        // coord: (col=0, row=0) for the first tile
        asm volatile(
            "cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes "
            "[%0], [%1, {%3, %4}], [%2];"
            :: "r"(smem_addr),
               "l"((uint64_t)tma_desc),
               "r"(mbar_addr),
               "r"(0),  // coord_x (column)
               "r"(0)   // coord_y (row)
            : "memory"
        );
    }
    __syncthreads();

    // Wait for TMA completion
    if (threadIdx.x == 0) {
        int phase = 0;
        asm volatile(
            "{\n\t"
            ".reg .pred p;\n\t"
            "LOOP:\n\t"
            "mbarrier.try_wait.parity.shared.b64 p, [%0], %1;\n\t"
            "@p bra DONE;\n\t"
            "bra LOOP;\n\t"
            "DONE:\n\t"
            "}"
            :: "r"((uint32_t)__cvta_generic_to_shared(sMbar)), "r"(phase)
            : "memory"
        );
    }
    __syncthreads();

    // Copy SMEM to GMEM for verification
    for (int i = threadIdx.x; i < ROWS * COLS; i += 32) {
        gmem_dst[i] = sData[i];
    }
}

int main() {
    printf("=== TMA Load Test with CUtensorMap ===\n");
    constexpr int TOTAL = ROWS * COLS;
    constexpr int DATA_BYTES = TOTAL * sizeof(bf16_t);

    // Allocate host data
    bf16_t* h_src = (bf16_t*)malloc(DATA_BYTES);
    bf16_t* h_dst = (bf16_t*)calloc(TOTAL, sizeof(bf16_t));
    srand(42);
    for (int i = 0; i < TOTAL; i++) h_src[i] = f32_to_bf16_host((float)(rand()%100)/100.0f - 0.5f);

    // Allocate device memory
    bf16_t *d_src, *d_dst;
    cudaMalloc(&d_src, DATA_BYTES);
    cudaMalloc(&d_dst, DATA_BYTES);
    cudaMemcpy(d_src, h_src, DATA_BYTES, cudaMemcpyHostToDevice);

    // Create CUtensorMap for a (ROWS, COLS) BF16 tensor
    // Layout: (rows, cols) = (128, 16) in row-major
    // TMA tile: (128, 16) — one tile covers the whole matrix
    uint64_t gdim[] = {(uint64_t)COLS, (uint64_t)ROWS};
    uint64_t gstr[] = {1, (uint64_t)COLS};
    uint32_t tdim[] = {TILE_COLS, TILE_ROWS};
    uint32_t tstr[] = {1, TILE_COLS};

    CUtensorMap tma_desc_host;
    CUresult res = cuTensorMapEncodeTiled(
        &tma_desc_host,
        CU_TENSOR_MAP_DATA_TYPE_UINT16,
        2,
        d_src,
        gdim, gstr, tdim, tstr,
        CU_TENSOR_MAP_INTERLEAVE_NONE,
        CU_TENSOR_MAP_SWIZZLE_NONE,       // No swizzle for now
        CU_TENSOR_MAP_L2_PROMOTION_NONE,
        CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE
    );

    if (res != CUDA_SUCCESS) {
        printf("cuTensorMapEncodeTiled FAILED: %d\n", res);
        return 1;
    }
    printf("CUtensorMap created successfully\n");

    // Copy tensor map to device (must be in GMEM for the kernel to read)
    CUtensorMap* d_tma_desc;
    cudaMalloc(&d_tma_desc, sizeof(CUtensorMap));
    cudaMemcpy(d_tma_desc, &tma_desc_host, sizeof(CUtensorMap), cudaMemcpyHostToDevice);

    // Launch kernel
    int smem = 8 + 128 + DATA_BYTES + 256;  // mbar + alignment + data + padding
    test_tma_load_kernel<<<1, 32, smem>>>(d_tma_desc, d_dst);

    cudaError_t err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        printf("CUDA ERROR: %s\n", cudaGetErrorString(err));
        // Try to get more info
        cudaError_t launch_err = cudaGetLastError();
        printf("Last error: %s\n", cudaGetErrorString(launch_err));
        return 1;
    }

    cudaMemcpy(h_dst, d_dst, DATA_BYTES, cudaMemcpyDeviceToHost);

    // Verify
    int mismatches = 0;
    float max_diff = 0;
    for (int i = 0; i < TOTAL; i++) {
        if (h_src[i] != h_dst[i]) mismatches++;
        float diff = fabsf(bf16_to_f32_host(h_src[i]) - bf16_to_f32_host(h_dst[i]));
        max_diff = fmaxf(max_diff, diff);
    }
    printf("Mismatches: %d / %d, Max diff: %.6f\n", mismatches, TOTAL, max_diff);
    printf("Test %s\n", mismatches == 0 ? "PASSED" : "FAILED");

    cudaFree(d_src); cudaFree(d_dst); cudaFree(d_tma_desc);
    free(h_src); free(h_dst);
    return mismatches == 0 ? 0 : 1;
}