P4: fix TMA descriptor dump (cuuint64_t dims, proper CUtensorMap API)
This commit is contained in:
@@ -1,47 +1,44 @@
|
||||
/**
|
||||
* P4: Dump TMA descriptor bytes for comparison.
|
||||
*
|
||||
* Creates a TMA descriptor using cuTensorMapEncodeTiled for a (128,16) BF16
|
||||
* Creates TMA descriptors using cuTensorMapEncodeTiled for a (128,16) BF16
|
||||
* tensor with various swizzle modes, and dumps the 128-byte descriptor.
|
||||
*
|
||||
* The working CuTeDSL path on the B200 can be used to create a matching
|
||||
* descriptor via a small Python script. This test dumps the raw CUDA path.
|
||||
*
|
||||
* Usage:
|
||||
* fire_b200_cuda_test tests/unit/test_p4_tma_descriptor_dump.cu
|
||||
*/
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <cstdio>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
|
||||
__global__ void dummy_kernel() {}
|
||||
|
||||
int main() {
|
||||
// Allocate a (128, 16) BF16 tensor on GPU
|
||||
const int ROWS = 128;
|
||||
const int COLS = 16;
|
||||
const size_t SIZE = ROWS * COLS * 2; // 4096 bytes
|
||||
const size_t SIZE = ROWS * COLS * 2;
|
||||
|
||||
void* d_ptr;
|
||||
cudaMalloc(&d_ptr, SIZE);
|
||||
cudaMemset(d_ptr, 0, SIZE);
|
||||
|
||||
// cuTensorMapEncodeTiled signature:
|
||||
// CUtensorMap*, cuuint32_t rank, CUtensorMapDataType,
|
||||
// void*, cuuint64_t*, cuuint64_t*, cuuint32_t*, cuuint32_t*,
|
||||
// CUtensorMapInterleave, CUtensorMapSwizzle,
|
||||
// CUtensorMapL2Promotion, CUtensorMapOOBFill
|
||||
|
||||
cuuint64_t tensorDims[] = {ROWS, COLS};
|
||||
cuuint64_t globalStrides[] = {COLS * 2, 2}; // byte strides
|
||||
cuuint32_t boxDims[] = {16, 16};
|
||||
cuuint32_t elementStrides[] = {1, 1};
|
||||
|
||||
CUtensorMap tma_desc;
|
||||
CUresult res;
|
||||
|
||||
// ==================================================================
|
||||
// Descriptor 1: NO swizzle (the one that hangs in cp.async.bulk.tensor)
|
||||
// Descriptor 1: NO swizzle
|
||||
// ==================================================================
|
||||
uint32_t tensorDims[] = {ROWS, COLS};
|
||||
uint64_t globalStrides[] = {COLS * 2, 2}; // bytes
|
||||
uint32_t boxDims[] = {16, 16};
|
||||
uint32_t elementStrides[] = {1, 1};
|
||||
|
||||
res = cuTensorMapEncodeTiled(
|
||||
&tma_desc,
|
||||
2, // rank
|
||||
2,
|
||||
CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
|
||||
d_ptr,
|
||||
tensorDims,
|
||||
@@ -56,7 +53,7 @@ int main() {
|
||||
|
||||
printf("=== Descriptor 1: NO swizzle ===\n");
|
||||
if (res != CUDA_SUCCESS) {
|
||||
printf("cuTensorMapEncodeTiled FAILED: %d\n", res);
|
||||
printf("FAILED: %d\n", res);
|
||||
} else {
|
||||
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&tma_desc);
|
||||
for (int i = 0; i < 128; i += 16) {
|
||||
@@ -67,7 +64,7 @@ int main() {
|
||||
}
|
||||
|
||||
// ==================================================================
|
||||
// Descriptor 2: SWIZZLE_128B (the one CuTeDSL uses for canonical layout)
|
||||
// Descriptor 2: SWIZZLE_128B
|
||||
// ==================================================================
|
||||
res = cuTensorMapEncodeTiled(
|
||||
&tma_desc,
|
||||
@@ -86,7 +83,7 @@ int main() {
|
||||
|
||||
printf("\n=== Descriptor 2: SWIZZLE_128B ===\n");
|
||||
if (res != CUDA_SUCCESS) {
|
||||
printf("cuTensorMapEncodeTiled FAILED: %d\n", res);
|
||||
printf("FAILED: %d\n", res);
|
||||
} else {
|
||||
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&tma_desc);
|
||||
for (int i = 0; i < 128; i += 16) {
|
||||
@@ -97,40 +94,7 @@ int main() {
|
||||
}
|
||||
|
||||
// ==================================================================
|
||||
// Descriptor 3: Different globalStrides (element strides vs byte strides)
|
||||
// CUDA 13 might need element strides, not byte strides
|
||||
// ==================================================================
|
||||
// Try with globalStrides in ELEMENTS (not bytes)
|
||||
uint64_t globalStrides_elem[] = {COLS, 1}; // elements, not bytes
|
||||
res = cuTensorMapEncodeTiled(
|
||||
&tma_desc,
|
||||
2,
|
||||
CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
|
||||
d_ptr,
|
||||
tensorDims,
|
||||
globalStrides_elem,
|
||||
boxDims,
|
||||
elementStrides,
|
||||
CU_TENSOR_MAP_INTERLEAVE_NONE,
|
||||
CU_TENSOR_MAP_SWIZZLE_NONE,
|
||||
CU_TENSOR_MAP_L2_PROMOTION_NONE,
|
||||
CU_TENSOR_MAP_OOB_FILL_NONE
|
||||
);
|
||||
|
||||
printf("\n=== Descriptor 3: NO swizzle, element strides (not byte) ===\n");
|
||||
if (res != CUDA_SUCCESS) {
|
||||
printf("cuTensorMapEncodeTiled FAILED: %d\n", res);
|
||||
} else {
|
||||
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&tma_desc);
|
||||
for (int i = 0; i < 128; i += 16) {
|
||||
printf("[%3d-%3d]: ", i, i+15);
|
||||
for (int j = 0; j < 16; j++) printf("%02x ", bytes[i+j]);
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
// ==================================================================
|
||||
// Descriptor 4: OOB_FILL_ZERO (maybe the hang is caused by OOB access)
|
||||
// Descriptor 3: NO swizzle, OOB_FILL_ZERO
|
||||
// ==================================================================
|
||||
res = cuTensorMapEncodeTiled(
|
||||
&tma_desc,
|
||||
@@ -147,9 +111,9 @@ int main() {
|
||||
CU_TENSOR_MAP_OOB_FILL_ZERO
|
||||
);
|
||||
|
||||
printf("\n=== Descriptor 4: NO swizzle, OOB_FILL_ZERO ===\n");
|
||||
printf("\n=== Descriptor 3: NO swizzle, OOB_FILL_ZERO ===\n");
|
||||
if (res != CUDA_SUCCESS) {
|
||||
printf("cuTensorMapEncodeTiled FAILED: %d\n", res);
|
||||
printf("FAILED: %d\n", res);
|
||||
} else {
|
||||
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&tma_desc);
|
||||
for (int i = 0; i < 128; i += 16) {
|
||||
@@ -160,16 +124,39 @@ int main() {
|
||||
}
|
||||
|
||||
// ==================================================================
|
||||
// Also test: actual TMA load with descriptor 1 (the one that hangs)
|
||||
// Descriptor 4: SWIZZLE_128B, OOB_FILL_ZERO
|
||||
// ==================================================================
|
||||
// We can't easily do this in a host test. The hang happens inside
|
||||
// a kernel when cp.async.bulk.tensor.2d is issued with this descriptor.
|
||||
// That test exists separately.
|
||||
res = cuTensorMapEncodeTiled(
|
||||
&tma_desc,
|
||||
2,
|
||||
CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
|
||||
d_ptr,
|
||||
tensorDims,
|
||||
globalStrides,
|
||||
boxDims,
|
||||
elementStrides,
|
||||
CU_TENSOR_MAP_INTERLEAVE_NONE,
|
||||
CU_TENSOR_MAP_SWIZZLE_128B,
|
||||
CU_TENSOR_MAP_L2_PROMOTION_NONE,
|
||||
CU_TENSOR_MAP_OOB_FILL_ZERO
|
||||
);
|
||||
|
||||
printf("\n=== Test: TMA load with descriptor 1 (NO swizzle) ===\n");
|
||||
// Quick test: launch a kernel that does TMA load
|
||||
// This is the test that hangs — we just verify the descriptor creates OK
|
||||
printf("Descriptor creation OK. TMA load test requires separate kernel.\n");
|
||||
printf("\n=== Descriptor 4: SWIZZLE_128B, OOB_FILL_ZERO ===\n");
|
||||
if (res != CUDA_SUCCESS) {
|
||||
printf("FAILED: %d\n", res);
|
||||
} else {
|
||||
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&tma_desc);
|
||||
for (int i = 0; i < 128; i += 16) {
|
||||
printf("[%3d-%3d]: ", i, i+15);
|
||||
for (int j = 0; j < 16; j++) printf("%02x ", bytes[i+j]);
|
||||
printf("\n");
|
||||
}
|
||||
}
|
||||
|
||||
// Also: test an actual TMA load to see which descriptors work
|
||||
printf("\n=== TMA load test (see separate kernel test) ===\n");
|
||||
printf("Descriptor creation OK for all variants.\n");
|
||||
printf("Actual TMA load behavior requires launching a kernel.\n");
|
||||
|
||||
cudaFree(d_ptr);
|
||||
printf("\nPASSED\n");
|
||||
|
||||
Reference in New Issue
Block a user