P4: fix TMA descriptor dump API order (dtype before rank)

This commit is contained in:
2026-05-30 08:33:12 +00:00
parent 3943be6063
commit 4898a946eb

View File

@@ -1,8 +1,6 @@
/**
* P4: Dump TMA descriptor bytes for comparison.
*
* Creates TMA descriptors using cuTensorMapEncodeTiled for a (128,16) BF16
* tensor with various swizzle modes, and dumps the 128-byte descriptor.
* Uses CUDA Driver API directly.
*/
#include <cuda.h>
#include <cuda_runtime.h>
@@ -10,6 +8,14 @@
#include <cstdint>
#include <cstring>
/* CUDA 13.2 enum values for cuTensorMapEncodeTiled:
* CUtensorMapDataType: 0=F16, 6=BF16
* CUtensorMapInterleave: 0=NONE, 1=16B, 2=32B
* CUtensorMapSwizzle: 0=NONE, 1=4B, 2=32B, 3=64B, 4=128B
* CUtensorMapL2Promotion: 0=NONE, 1=64B, 2=128B, 3=256B
* CUtensorMapOOBFill: 0=NONE, 1=ZERO
*/
int main() {
const int ROWS = 128;
const int COLS = 16;
@@ -19,43 +25,34 @@ int main() {
cudaMalloc(&d_ptr, SIZE);
cudaMemset(d_ptr, 0, SIZE);
// cuTensorMapEncodeTiled signature:
// CUtensorMap*, cuuint32_t rank, CUtensorMapDataType,
// void*, cuuint64_t*, cuuint64_t*, cuuint32_t*, cuuint32_t*,
// CUtensorMapInterleave, CUtensorMapSwizzle,
// CUtensorMapL2Promotion, CUtensorMapOOBFill
cuuint64_t tensorDims[] = {ROWS, COLS};
cuuint64_t globalStrides[] = {COLS * 2, 2}; // byte strides
cuuint64_t tensorDims[] = {(cuuint64_t)ROWS, (cuuint64_t)COLS};
cuuint64_t globalStrides[] = {(cuuint64_t)(COLS * 2), (cuuint64_t)2};
cuuint32_t boxDims[] = {16, 16};
cuuint32_t elementStrides[] = {1, 1};
CUtensorMap tma_desc;
CUresult res;
// ==================================================================
// Descriptor 1: NO swizzle
// ==================================================================
res = cuTensorMapEncodeTiled(
&tma_desc,
2,
CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
(CUtensorMapDataType)6, // BF16
2, // rank
d_ptr,
tensorDims,
globalStrides,
boxDims,
elementStrides,
CU_TENSOR_MAP_INTERLEAVE_NONE,
CU_TENSOR_MAP_SWIZZLE_NONE,
CU_TENSOR_MAP_L2_PROMOTION_NONE,
CU_TENSOR_MAP_OOB_FILL_NONE
(CUtensorMapInterleave)0, // NONE
(CUtensorMapSwizzle)0, // NONE
(CUtensorMapL2Promotion)0, // NONE
(CUtensorMapOOBFill)0 // NONE
);
printf("=== Descriptor 1: NO swizzle ===\n");
if (res != CUDA_SUCCESS) {
printf("FAILED: %d\n", res);
} else {
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&tma_desc);
if (res != CUDA_SUCCESS) { printf("FAILED: %d\n", res); }
else {
auto* bytes = reinterpret_cast<const uint8_t*>(&tma_desc);
for (int i = 0; i < 128; i += 16) {
printf("[%3d-%3d]: ", i, i+15);
for (int j = 0; j < 16; j++) printf("%02x ", bytes[i+j]);
@@ -63,29 +60,26 @@ int main() {
}
}
// ==================================================================
// Descriptor 2: SWIZZLE_128B
// ==================================================================
res = cuTensorMapEncodeTiled(
&tma_desc,
(CUtensorMapDataType)6,
2,
CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
d_ptr,
tensorDims,
globalStrides,
boxDims,
elementStrides,
CU_TENSOR_MAP_INTERLEAVE_NONE,
CU_TENSOR_MAP_SWIZZLE_128B,
CU_TENSOR_MAP_L2_PROMOTION_NONE,
CU_TENSOR_MAP_OOB_FILL_NONE
(CUtensorMapInterleave)0,
(CUtensorMapSwizzle)4, // 128B
(CUtensorMapL2Promotion)0,
(CUtensorMapOOBFill)0
);
printf("\n=== Descriptor 2: SWIZZLE_128B ===\n");
if (res != CUDA_SUCCESS) {
printf("FAILED: %d\n", res);
} else {
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&tma_desc);
if (res != CUDA_SUCCESS) { printf("FAILED: %d\n", res); }
else {
auto* bytes = reinterpret_cast<const uint8_t*>(&tma_desc);
for (int i = 0; i < 128; i += 16) {
printf("[%3d-%3d]: ", i, i+15);
for (int j = 0; j < 16; j++) printf("%02x ", bytes[i+j]);
@@ -93,29 +87,26 @@ int main() {
}
}
// ==================================================================
// Descriptor 3: NO swizzle, OOB_FILL_ZERO
// ==================================================================
res = cuTensorMapEncodeTiled(
&tma_desc,
(CUtensorMapDataType)6,
2,
CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
d_ptr,
tensorDims,
globalStrides,
boxDims,
elementStrides,
CU_TENSOR_MAP_INTERLEAVE_NONE,
CU_TENSOR_MAP_SWIZZLE_NONE,
CU_TENSOR_MAP_L2_PROMOTION_NONE,
CU_TENSOR_MAP_OOB_FILL_ZERO
(CUtensorMapInterleave)0,
(CUtensorMapSwizzle)0,
(CUtensorMapL2Promotion)0,
(CUtensorMapOOBFill)1 // ZERO
);
printf("\n=== Descriptor 3: NO swizzle, OOB_FILL_ZERO ===\n");
if (res != CUDA_SUCCESS) {
printf("FAILED: %d\n", res);
} else {
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&tma_desc);
if (res != CUDA_SUCCESS) { printf("FAILED: %d\n", res); }
else {
auto* bytes = reinterpret_cast<const uint8_t*>(&tma_desc);
for (int i = 0; i < 128; i += 16) {
printf("[%3d-%3d]: ", i, i+15);
for (int j = 0; j < 16; j++) printf("%02x ", bytes[i+j]);
@@ -123,29 +114,26 @@ int main() {
}
}
// ==================================================================
// Descriptor 4: SWIZZLE_128B, OOB_FILL_ZERO
// ==================================================================
res = cuTensorMapEncodeTiled(
&tma_desc,
(CUtensorMapDataType)6,
2,
CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
d_ptr,
tensorDims,
globalStrides,
boxDims,
elementStrides,
CU_TENSOR_MAP_INTERLEAVE_NONE,
CU_TENSOR_MAP_SWIZZLE_128B,
CU_TENSOR_MAP_L2_PROMOTION_NONE,
CU_TENSOR_MAP_OOB_FILL_ZERO
(CUtensorMapInterleave)0,
(CUtensorMapSwizzle)4,
(CUtensorMapL2Promotion)0,
(CUtensorMapOOBFill)1
);
printf("\n=== Descriptor 4: SWIZZLE_128B, OOB_FILL_ZERO ===\n");
if (res != CUDA_SUCCESS) {
printf("FAILED: %d\n", res);
} else {
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&tma_desc);
if (res != CUDA_SUCCESS) { printf("FAILED: %d\n", res); }
else {
auto* bytes = reinterpret_cast<const uint8_t*>(&tma_desc);
for (int i = 0; i < 128; i += 16) {
printf("[%3d-%3d]: ", i, i+15);
for (int j = 0; j < 16; j++) printf("%02x ", bytes[i+j]);
@@ -153,11 +141,6 @@ int main() {
}
}
// Also: test an actual TMA load to see which descriptors work
printf("\n=== TMA load test (see separate kernel test) ===\n");
printf("Descriptor creation OK for all variants.\n");
printf("Actual TMA load behavior requires launching a kernel.\n");
cudaFree(d_ptr);
printf("\nPASSED\n");
return 0;