P4: fix TMA descriptor dump (cuuint64_t dims, proper CUtensorMap API)

This commit is contained in:
2026-05-30 08:32:34 +00:00
parent 4df6ea2d8c
commit 3943be6063

View File

@@ -1,47 +1,44 @@
/**
* P4: Dump TMA descriptor bytes for comparison.
*
* Creates a TMA descriptor using cuTensorMapEncodeTiled for a (128,16) BF16
* Creates TMA descriptors using cuTensorMapEncodeTiled for a (128,16) BF16
* tensor with various swizzle modes, and dumps the 128-byte descriptor.
*
* The working CuTeDSL path on the B200 can be used to create a matching
* descriptor via a small Python script. This test dumps the raw CUDA path.
*
* Usage:
* fire_b200_cuda_test tests/unit/test_p4_tma_descriptor_dump.cu
*/
#include <cuda_runtime.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdint>
#include <cstring>
__global__ void dummy_kernel() {}
int main() {
// Allocate a (128, 16) BF16 tensor on GPU
const int ROWS = 128;
const int COLS = 16;
const size_t SIZE = ROWS * COLS * 2; // 4096 bytes
const size_t SIZE = ROWS * COLS * 2;
void* d_ptr;
cudaMalloc(&d_ptr, SIZE);
cudaMemset(d_ptr, 0, SIZE);
// cuTensorMapEncodeTiled signature:
// CUtensorMap*, cuuint32_t rank, CUtensorMapDataType,
// void*, cuuint64_t*, cuuint64_t*, cuuint32_t*, cuuint32_t*,
// CUtensorMapInterleave, CUtensorMapSwizzle,
// CUtensorMapL2Promotion, CUtensorMapOOBFill
cuuint64_t tensorDims[] = {ROWS, COLS};
cuuint64_t globalStrides[] = {COLS * 2, 2}; // byte strides
cuuint32_t boxDims[] = {16, 16};
cuuint32_t elementStrides[] = {1, 1};
CUtensorMap tma_desc;
CUresult res;
// ==================================================================
// Descriptor 1: NO swizzle (the one that hangs in cp.async.bulk.tensor)
// Descriptor 1: NO swizzle
// ==================================================================
uint32_t tensorDims[] = {ROWS, COLS};
uint64_t globalStrides[] = {COLS * 2, 2}; // bytes
uint32_t boxDims[] = {16, 16};
uint32_t elementStrides[] = {1, 1};
res = cuTensorMapEncodeTiled(
&tma_desc,
2, // rank
2,
CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
d_ptr,
tensorDims,
@@ -56,7 +53,7 @@ int main() {
printf("=== Descriptor 1: NO swizzle ===\n");
if (res != CUDA_SUCCESS) {
printf("cuTensorMapEncodeTiled FAILED: %d\n", res);
printf("FAILED: %d\n", res);
} else {
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&tma_desc);
for (int i = 0; i < 128; i += 16) {
@@ -67,7 +64,7 @@ int main() {
}
// ==================================================================
// Descriptor 2: SWIZZLE_128B (the one CuTeDSL uses for canonical layout)
// Descriptor 2: SWIZZLE_128B
// ==================================================================
res = cuTensorMapEncodeTiled(
&tma_desc,
@@ -86,7 +83,7 @@ int main() {
printf("\n=== Descriptor 2: SWIZZLE_128B ===\n");
if (res != CUDA_SUCCESS) {
printf("cuTensorMapEncodeTiled FAILED: %d\n", res);
printf("FAILED: %d\n", res);
} else {
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&tma_desc);
for (int i = 0; i < 128; i += 16) {
@@ -97,40 +94,7 @@ int main() {
}
// ==================================================================
// Descriptor 3: Different globalStrides (element strides vs byte strides)
// CUDA 13 might need element strides, not byte strides
// ==================================================================
// Try with globalStrides in ELEMENTS (not bytes)
uint64_t globalStrides_elem[] = {COLS, 1}; // elements, not bytes
res = cuTensorMapEncodeTiled(
&tma_desc,
2,
CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
d_ptr,
tensorDims,
globalStrides_elem,
boxDims,
elementStrides,
CU_TENSOR_MAP_INTERLEAVE_NONE,
CU_TENSOR_MAP_SWIZZLE_NONE,
CU_TENSOR_MAP_L2_PROMOTION_NONE,
CU_TENSOR_MAP_OOB_FILL_NONE
);
printf("\n=== Descriptor 3: NO swizzle, element strides (not byte) ===\n");
if (res != CUDA_SUCCESS) {
printf("cuTensorMapEncodeTiled FAILED: %d\n", res);
} else {
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&tma_desc);
for (int i = 0; i < 128; i += 16) {
printf("[%3d-%3d]: ", i, i+15);
for (int j = 0; j < 16; j++) printf("%02x ", bytes[i+j]);
printf("\n");
}
}
// ==================================================================
// Descriptor 4: OOB_FILL_ZERO (maybe the hang is caused by OOB access)
// Descriptor 3: NO swizzle, OOB_FILL_ZERO
// ==================================================================
res = cuTensorMapEncodeTiled(
&tma_desc,
@@ -147,9 +111,9 @@ int main() {
CU_TENSOR_MAP_OOB_FILL_ZERO
);
printf("\n=== Descriptor 4: NO swizzle, OOB_FILL_ZERO ===\n");
printf("\n=== Descriptor 3: NO swizzle, OOB_FILL_ZERO ===\n");
if (res != CUDA_SUCCESS) {
printf("cuTensorMapEncodeTiled FAILED: %d\n", res);
printf("FAILED: %d\n", res);
} else {
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&tma_desc);
for (int i = 0; i < 128; i += 16) {
@@ -160,16 +124,39 @@ int main() {
}
// ==================================================================
// Also test: actual TMA load with descriptor 1 (the one that hangs)
// Descriptor 4: SWIZZLE_128B, OOB_FILL_ZERO
// ==================================================================
// We can't easily do this in a host test. The hang happens inside
// a kernel when cp.async.bulk.tensor.2d is issued with this descriptor.
// That test exists separately.
res = cuTensorMapEncodeTiled(
&tma_desc,
2,
CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
d_ptr,
tensorDims,
globalStrides,
boxDims,
elementStrides,
CU_TENSOR_MAP_INTERLEAVE_NONE,
CU_TENSOR_MAP_SWIZZLE_128B,
CU_TENSOR_MAP_L2_PROMOTION_NONE,
CU_TENSOR_MAP_OOB_FILL_ZERO
);
printf("\n=== Test: TMA load with descriptor 1 (NO swizzle) ===\n");
// Quick test: launch a kernel that does TMA load
// This is the test that hangs — we just verify the descriptor creates OK
printf("Descriptor creation OK. TMA load test requires separate kernel.\n");
printf("\n=== Descriptor 4: SWIZZLE_128B, OOB_FILL_ZERO ===\n");
if (res != CUDA_SUCCESS) {
printf("FAILED: %d\n", res);
} else {
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&tma_desc);
for (int i = 0; i < 128; i += 16) {
printf("[%3d-%3d]: ", i, i+15);
for (int j = 0; j < 16; j++) printf("%02x ", bytes[i+j]);
printf("\n");
}
}
// Also: test an actual TMA load to see which descriptors work
printf("\n=== TMA load test (see separate kernel test) ===\n");
printf("Descriptor creation OK for all variants.\n");
printf("Actual TMA load behavior requires launching a kernel.\n");
cudaFree(d_ptr);
printf("\nPASSED\n");