P4: fix TMA descriptor dump API order (dtype before rank)
This commit is contained in:
@@ -1,8 +1,6 @@
|
||||
/**
|
||||
* P4: Dump TMA descriptor bytes for comparison.
|
||||
*
|
||||
* Creates TMA descriptors using cuTensorMapEncodeTiled for a (128,16) BF16
|
||||
* tensor with various swizzle modes, and dumps the 128-byte descriptor.
|
||||
* Uses CUDA Driver API directly.
|
||||
*/
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
@@ -10,6 +8,14 @@
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
|
||||
/* CUDA 13.2 enum values for cuTensorMapEncodeTiled:
|
||||
* CUtensorMapDataType: 0=F16, 6=BF16
|
||||
* CUtensorMapInterleave: 0=NONE, 1=16B, 2=32B
|
||||
* CUtensorMapSwizzle: 0=NONE, 1=4B, 2=32B, 3=64B, 4=128B
|
||||
* CUtensorMapL2Promotion: 0=NONE, 1=64B, 2=128B, 3=256B
|
||||
* CUtensorMapOOBFill: 0=NONE, 1=ZERO
|
||||
*/
|
||||
|
||||
int main() {
|
||||
const int ROWS = 128;
|
||||
const int COLS = 16;
|
||||
@@ -19,43 +25,34 @@ int main() {
|
||||
cudaMalloc(&d_ptr, SIZE);
|
||||
cudaMemset(d_ptr, 0, SIZE);
|
||||
|
||||
// cuTensorMapEncodeTiled signature:
|
||||
// CUtensorMap*, cuuint32_t rank, CUtensorMapDataType,
|
||||
// void*, cuuint64_t*, cuuint64_t*, cuuint32_t*, cuuint32_t*,
|
||||
// CUtensorMapInterleave, CUtensorMapSwizzle,
|
||||
// CUtensorMapL2Promotion, CUtensorMapOOBFill
|
||||
|
||||
cuuint64_t tensorDims[] = {ROWS, COLS};
|
||||
cuuint64_t globalStrides[] = {COLS * 2, 2}; // byte strides
|
||||
cuuint64_t tensorDims[] = {(cuuint64_t)ROWS, (cuuint64_t)COLS};
|
||||
cuuint64_t globalStrides[] = {(cuuint64_t)(COLS * 2), (cuuint64_t)2};
|
||||
cuuint32_t boxDims[] = {16, 16};
|
||||
cuuint32_t elementStrides[] = {1, 1};
|
||||
|
||||
CUtensorMap tma_desc;
|
||||
CUresult res;
|
||||
|
||||
// ==================================================================
|
||||
// Descriptor 1: NO swizzle
|
||||
// ==================================================================
|
||||
res = cuTensorMapEncodeTiled(
|
||||
&tma_desc,
|
||||
2,
|
||||
CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
|
||||
(CUtensorMapDataType)6, // BF16
|
||||
2, // rank
|
||||
d_ptr,
|
||||
tensorDims,
|
||||
globalStrides,
|
||||
boxDims,
|
||||
elementStrides,
|
||||
CU_TENSOR_MAP_INTERLEAVE_NONE,
|
||||
CU_TENSOR_MAP_SWIZZLE_NONE,
|
||||
CU_TENSOR_MAP_L2_PROMOTION_NONE,
|
||||
CU_TENSOR_MAP_OOB_FILL_NONE
|
||||
(CUtensorMapInterleave)0, // NONE
|
||||
(CUtensorMapSwizzle)0, // NONE
|
||||
(CUtensorMapL2Promotion)0, // NONE
|
||||
(CUtensorMapOOBFill)0 // NONE
|
||||
);
|
||||
|
||||
printf("=== Descriptor 1: NO swizzle ===\n");
|
||||
if (res != CUDA_SUCCESS) {
|
||||
printf("FAILED: %d\n", res);
|
||||
} else {
|
||||
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&tma_desc);
|
||||
if (res != CUDA_SUCCESS) { printf("FAILED: %d\n", res); }
|
||||
else {
|
||||
auto* bytes = reinterpret_cast<const uint8_t*>(&tma_desc);
|
||||
for (int i = 0; i < 128; i += 16) {
|
||||
printf("[%3d-%3d]: ", i, i+15);
|
||||
for (int j = 0; j < 16; j++) printf("%02x ", bytes[i+j]);
|
||||
@@ -63,29 +60,26 @@ int main() {
|
||||
}
|
||||
}
|
||||
|
||||
// ==================================================================
|
||||
// Descriptor 2: SWIZZLE_128B
|
||||
// ==================================================================
|
||||
res = cuTensorMapEncodeTiled(
|
||||
&tma_desc,
|
||||
(CUtensorMapDataType)6,
|
||||
2,
|
||||
CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
|
||||
d_ptr,
|
||||
tensorDims,
|
||||
globalStrides,
|
||||
boxDims,
|
||||
elementStrides,
|
||||
CU_TENSOR_MAP_INTERLEAVE_NONE,
|
||||
CU_TENSOR_MAP_SWIZZLE_128B,
|
||||
CU_TENSOR_MAP_L2_PROMOTION_NONE,
|
||||
CU_TENSOR_MAP_OOB_FILL_NONE
|
||||
(CUtensorMapInterleave)0,
|
||||
(CUtensorMapSwizzle)4, // 128B
|
||||
(CUtensorMapL2Promotion)0,
|
||||
(CUtensorMapOOBFill)0
|
||||
);
|
||||
|
||||
printf("\n=== Descriptor 2: SWIZZLE_128B ===\n");
|
||||
if (res != CUDA_SUCCESS) {
|
||||
printf("FAILED: %d\n", res);
|
||||
} else {
|
||||
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&tma_desc);
|
||||
if (res != CUDA_SUCCESS) { printf("FAILED: %d\n", res); }
|
||||
else {
|
||||
auto* bytes = reinterpret_cast<const uint8_t*>(&tma_desc);
|
||||
for (int i = 0; i < 128; i += 16) {
|
||||
printf("[%3d-%3d]: ", i, i+15);
|
||||
for (int j = 0; j < 16; j++) printf("%02x ", bytes[i+j]);
|
||||
@@ -93,29 +87,26 @@ int main() {
|
||||
}
|
||||
}
|
||||
|
||||
// ==================================================================
|
||||
// Descriptor 3: NO swizzle, OOB_FILL_ZERO
|
||||
// ==================================================================
|
||||
res = cuTensorMapEncodeTiled(
|
||||
&tma_desc,
|
||||
(CUtensorMapDataType)6,
|
||||
2,
|
||||
CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
|
||||
d_ptr,
|
||||
tensorDims,
|
||||
globalStrides,
|
||||
boxDims,
|
||||
elementStrides,
|
||||
CU_TENSOR_MAP_INTERLEAVE_NONE,
|
||||
CU_TENSOR_MAP_SWIZZLE_NONE,
|
||||
CU_TENSOR_MAP_L2_PROMOTION_NONE,
|
||||
CU_TENSOR_MAP_OOB_FILL_ZERO
|
||||
(CUtensorMapInterleave)0,
|
||||
(CUtensorMapSwizzle)0,
|
||||
(CUtensorMapL2Promotion)0,
|
||||
(CUtensorMapOOBFill)1 // ZERO
|
||||
);
|
||||
|
||||
printf("\n=== Descriptor 3: NO swizzle, OOB_FILL_ZERO ===\n");
|
||||
if (res != CUDA_SUCCESS) {
|
||||
printf("FAILED: %d\n", res);
|
||||
} else {
|
||||
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&tma_desc);
|
||||
if (res != CUDA_SUCCESS) { printf("FAILED: %d\n", res); }
|
||||
else {
|
||||
auto* bytes = reinterpret_cast<const uint8_t*>(&tma_desc);
|
||||
for (int i = 0; i < 128; i += 16) {
|
||||
printf("[%3d-%3d]: ", i, i+15);
|
||||
for (int j = 0; j < 16; j++) printf("%02x ", bytes[i+j]);
|
||||
@@ -123,29 +114,26 @@ int main() {
|
||||
}
|
||||
}
|
||||
|
||||
// ==================================================================
|
||||
// Descriptor 4: SWIZZLE_128B, OOB_FILL_ZERO
|
||||
// ==================================================================
|
||||
res = cuTensorMapEncodeTiled(
|
||||
&tma_desc,
|
||||
(CUtensorMapDataType)6,
|
||||
2,
|
||||
CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
|
||||
d_ptr,
|
||||
tensorDims,
|
||||
globalStrides,
|
||||
boxDims,
|
||||
elementStrides,
|
||||
CU_TENSOR_MAP_INTERLEAVE_NONE,
|
||||
CU_TENSOR_MAP_SWIZZLE_128B,
|
||||
CU_TENSOR_MAP_L2_PROMOTION_NONE,
|
||||
CU_TENSOR_MAP_OOB_FILL_ZERO
|
||||
(CUtensorMapInterleave)0,
|
||||
(CUtensorMapSwizzle)4,
|
||||
(CUtensorMapL2Promotion)0,
|
||||
(CUtensorMapOOBFill)1
|
||||
);
|
||||
|
||||
printf("\n=== Descriptor 4: SWIZZLE_128B, OOB_FILL_ZERO ===\n");
|
||||
if (res != CUDA_SUCCESS) {
|
||||
printf("FAILED: %d\n", res);
|
||||
} else {
|
||||
const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&tma_desc);
|
||||
if (res != CUDA_SUCCESS) { printf("FAILED: %d\n", res); }
|
||||
else {
|
||||
auto* bytes = reinterpret_cast<const uint8_t*>(&tma_desc);
|
||||
for (int i = 0; i < 128; i += 16) {
|
||||
printf("[%3d-%3d]: ", i, i+15);
|
||||
for (int j = 0; j < 16; j++) printf("%02x ", bytes[i+j]);
|
||||
@@ -153,11 +141,6 @@ int main() {
|
||||
}
|
||||
}
|
||||
|
||||
// Also: test an actual TMA load to see which descriptors work
|
||||
printf("\n=== TMA load test (see separate kernel test) ===\n");
|
||||
printf("Descriptor creation OK for all variants.\n");
|
||||
printf("Actual TMA load behavior requires launching a kernel.\n");
|
||||
|
||||
cudaFree(d_ptr);
|
||||
printf("\nPASSED\n");
|
||||
return 0;
|
||||
|
||||
Reference in New Issue
Block a user