P4: correct CUDA 13.2 API (dataType before rank, FloatOOBfill, globalDim)
This commit is contained in:
@@ -2,20 +2,17 @@
|
||||
* P4: Dump TMA descriptor bytes for comparison.
|
||||
* CUDA 13.2 compatible.
|
||||
*
|
||||
* API signature on CUDA 13.2:
|
||||
* cuTensorMapEncodeTiled(CUtensorMap*, cuuint32_t rank,
|
||||
* CUtensorMapDataType, void*,
|
||||
* cuuint64_t* tensorDims, cuuint64_t* globalStrides,
|
||||
* cuuint32_t* boxDims, cuuint32_t* elementStrides,
|
||||
* CUtensorMapInterleave, CUtensorMapSwizzle, CUtensorMapL2promotion)
|
||||
*
|
||||
* Note: NO OOB fill parameter on CUDA 13.2 (10 params, not 11).
|
||||
* Signature: cuTensorMapEncodeTiled(
|
||||
* CUtensorMap*, CUtensorMapDataType, cuuint32_t tensorRank,
|
||||
* void*, cuuint64_t* globalDim, cuuint64_t* globalStrides,
|
||||
* cuuint32_t* boxDim, cuuint32_t* elementStrides,
|
||||
* CUtensorMapInterleave, CUtensorMapSwizzle,
|
||||
* CUtensorMapL2promotion, CUtensorMapFloatOOBfill)
|
||||
*/
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <cstdio>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
|
||||
int main() {
|
||||
const int ROWS = 128;
|
||||
@@ -26,15 +23,19 @@ int main() {
|
||||
cudaMalloc(&d_ptr, SIZE);
|
||||
cudaMemset(d_ptr, 0, SIZE);
|
||||
|
||||
cuuint64_t tensorDims[] = {(cuuint64_t)ROWS, (cuuint64_t)COLS};
|
||||
// globalDim: tensor dimensions (ROWS, COLS) in elements
|
||||
cuuint64_t globalDim[] = {(cuuint64_t)ROWS, (cuuint64_t)COLS};
|
||||
// globalStrides: byte strides between rows and between elements
|
||||
cuuint64_t globalStrides[] = {(cuuint64_t)(COLS * 2), (cuuint64_t)2};
|
||||
cuuint32_t boxDims[] = {16, 16};
|
||||
// boxDim: TMA tile dimensions (16, 16)
|
||||
cuuint32_t boxDim[] = {16, 16};
|
||||
// elementStrides: (1, 1) = contiguous
|
||||
cuuint32_t elementStrides[] = {1, 1};
|
||||
|
||||
CUtensorMap tma_desc;
|
||||
CUresult res;
|
||||
|
||||
auto dump_desc = [](const char* label, const CUtensorMap& desc) {
|
||||
auto dump = [](const char* label, const CUtensorMap& desc) {
|
||||
printf("=== %s ===\n", label);
|
||||
auto* b = reinterpret_cast<const uint8_t*>(&desc);
|
||||
for (int i = 0; i < 128; i += 16) {
|
||||
@@ -44,21 +45,41 @@ int main() {
|
||||
}
|
||||
};
|
||||
|
||||
// 1: NO swizzle
|
||||
res = cuTensorMapEncodeTiled(&tma_desc, 2, CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
|
||||
d_ptr, tensorDims, globalStrides, boxDims, elementStrides,
|
||||
// 1: NO swizzle, OOB_NONE
|
||||
res = cuTensorMapEncodeTiled(&tma_desc,
|
||||
CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, 2,
|
||||
d_ptr, globalDim, globalStrides, boxDim, elementStrides,
|
||||
CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
|
||||
CU_TENSOR_MAP_L2_PROMOTION_NONE);
|
||||
if (res == CUDA_SUCCESS) dump_desc("NO swizzle", tma_desc);
|
||||
else printf("=== NO swizzle: FAILED (%d) ===\n", res);
|
||||
CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
|
||||
if (res == CUDA_SUCCESS) dump("NO swizzle, OOB_NONE", tma_desc);
|
||||
else printf("=== NO swizzle, OOB_NONE: FAILED (%d) ===\n", res);
|
||||
|
||||
// 2: SWIZZLE_128B
|
||||
res = cuTensorMapEncodeTiled(&tma_desc, 2, CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
|
||||
d_ptr, tensorDims, globalStrides, boxDims, elementStrides,
|
||||
// 2: SWIZZLE_128B, OOB_NONE
|
||||
res = cuTensorMapEncodeTiled(&tma_desc,
|
||||
CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, 2,
|
||||
d_ptr, globalDim, globalStrides, boxDim, elementStrides,
|
||||
CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_128B,
|
||||
CU_TENSOR_MAP_L2_PROMOTION_NONE);
|
||||
if (res == CUDA_SUCCESS) dump_desc("SWIZZLE_128B", tma_desc);
|
||||
else printf("=== SWIZZLE_128B: FAILED (%d) ===\n", res);
|
||||
CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE);
|
||||
if (res == CUDA_SUCCESS) dump("SWIZZLE_128B, OOB_NONE", tma_desc);
|
||||
else printf("=== SWIZZLE_128B, OOB_NONE: FAILED (%d) ===\n", res);
|
||||
|
||||
// 3: NO swizzle, OOB_FILL_ZERO
|
||||
res = cuTensorMapEncodeTiled(&tma_desc,
|
||||
CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, 2,
|
||||
d_ptr, globalDim, globalStrides, boxDim, elementStrides,
|
||||
CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
|
||||
CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_ZERO);
|
||||
if (res == CUDA_SUCCESS) dump("NO swizzle, OOB_FILL_ZERO", tma_desc);
|
||||
else printf("=== NO swizzle, OOB_FILL_ZERO: FAILED (%d) ===\n", res);
|
||||
|
||||
// 4: SWIZZLE_128B, OOB_FILL_ZERO
|
||||
res = cuTensorMapEncodeTiled(&tma_desc,
|
||||
CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, 2,
|
||||
d_ptr, globalDim, globalStrides, boxDim, elementStrides,
|
||||
CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_128B,
|
||||
CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_FLOAT_OOB_FILL_ZERO);
|
||||
if (res == CUDA_SUCCESS) dump("SWIZZLE_128B, OOB_FILL_ZERO", tma_desc);
|
||||
else printf("=== SWIZZLE_128B, OOB_FILL_ZERO: FAILED (%d) ===\n", res);
|
||||
|
||||
cudaFree(d_ptr);
|
||||
printf("\nPASSED\n");
|
||||
|
||||
Reference in New Issue
Block a user