P4: CUDA 13.2 has 10-param cuTensorMapEncodeTiled (no OOB fill)

This commit is contained in:
2026-05-30 08:35:34 +00:00
parent d8ffdb66e1
commit 8df3ccecea

View File

@@ -1,6 +1,15 @@
/**
* P4: Dump TMA descriptor bytes for comparison.
* CUDA 13.2 compatible — uses correct API signature.
* CUDA 13.2 compatible.
*
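* API signature assumed by this file on CUDA 13.2:
* cuTensorMapEncodeTiled(CUtensorMap*, cuuint32_t rank,
* CUtensorMapDataType, void*,
* cuuint64_t* tensorDims, cuuint64_t* globalStrides,
* cuuint32_t* boxDims, cuuint32_t* elementStrides,
* CUtensorMapInterleave, CUtensorMapSwizzle, CUtensorMapL2promotion)
*
* Note: NO OOB fill parameter is passed (the list above has 11 params, not 12).
* NOTE(review): the public CUDA Driver API reference documents 12 parameters,
* with CUtensorMapDataType BEFORE the rank and a trailing
* CUtensorMapFloatOOBfill argument. Verify this signature against the
* installed cuda.h before relying on it.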
*/
#include <cuda.h>
#include <cuda_runtime.h>
@@ -8,33 +17,6 @@
#include <cstdint>
#include <cstring>
/* CUDA 13.2 cuTensorMapEncodeTiled signature:
* CUresult cuTensorMapEncodeTiled(
* CUtensorMap *tensorMap,
* cuuint32_t tensorRank,
* CUtensorMapDataType dataType,
* void *globalAddress,
* const cuuint64_t *tensorDims,
* const cuuint64_t *globalStrides,
* const cuuint32_t *boxDims,
* const cuuint32_t *elementStrides,
* CUtensorMapInterleave interleave,
* CUtensorMapSwizzle swizzle,
* CUtensorMapL2promotion l2Promotion,
* CUtensorMapOOBfill oobFill
* );
*
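* Note: the OOB fill type is spelled CUtensorMapOOBfill (lowercase 'f' in
* "fill") in this listing.
* NOTE(review): the public CUDA Driver API reference names the type
* CUtensorMapFloatOOBfill (values CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE and
* CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA) and lists tensorDataType
* before tensorRank. Confirm against the installed cuda.h.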
*/
// Fallback definitions for the OOB-fill constants passed as the last argument
// of the cuTensorMapEncodeTiled calls in main(): NONE maps to 0, ZERO maps to 1,
// each cast to CUtensorMapOOBfill.
// NOTE(review): #ifndef only detects preprocessor macros. If the CUDA headers
// declare these names as enumerators rather than macros, the guards will not
// see them and the macros below will shadow the enumerators. Confirm against
// cuda.h.
#ifndef CU_TENSOR_MAP_OOB_FILL_NONE
#define CU_TENSOR_MAP_OOB_FILL_NONE ((CUtensorMapOOBfill)0)
#endif
#ifndef CU_TENSOR_MAP_OOB_FILL_ZERO
#define CU_TENSOR_MAP_OOB_FILL_ZERO ((CUtensorMapOOBfill)1)
#endif
int main() {
const int ROWS = 128;
const int COLS = 16;
@@ -66,7 +48,7 @@ int main() {
res = cuTensorMapEncodeTiled(&tma_desc, 2, CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
d_ptr, tensorDims, globalStrides, boxDims, elementStrides,
CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_OOB_FILL_NONE);
CU_TENSOR_MAP_L2_PROMOTION_NONE);
if (res == CUDA_SUCCESS) dump_desc("NO swizzle", tma_desc);
else printf("=== NO swizzle: FAILED (%d) ===\n", res);
@@ -74,26 +56,10 @@ int main() {
res = cuTensorMapEncodeTiled(&tma_desc, 2, CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
d_ptr, tensorDims, globalStrides, boxDims, elementStrides,
CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_128B,
CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_OOB_FILL_NONE);
CU_TENSOR_MAP_L2_PROMOTION_NONE);
if (res == CUDA_SUCCESS) dump_desc("SWIZZLE_128B", tma_desc);
else printf("=== SWIZZLE_128B: FAILED (%d) ===\n", res);
// 3: NO swizzle, OOB_FILL_ZERO
res = cuTensorMapEncodeTiled(&tma_desc, 2, CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
d_ptr, tensorDims, globalStrides, boxDims, elementStrides,
CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_NONE,
CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_OOB_FILL_ZERO);
if (res == CUDA_SUCCESS) dump_desc("NO swizzle + OOB_FILL_ZERO", tma_desc);
else printf("=== NO swizzle + OOB_FILL_ZERO: FAILED (%d) ===\n", res);
// 4: SWIZZLE_128B, OOB_FILL_ZERO
res = cuTensorMapEncodeTiled(&tma_desc, 2, CU_TENSOR_MAP_DATA_TYPE_BFLOAT16,
d_ptr, tensorDims, globalStrides, boxDims, elementStrides,
CU_TENSOR_MAP_INTERLEAVE_NONE, CU_TENSOR_MAP_SWIZZLE_128B,
CU_TENSOR_MAP_L2_PROMOTION_NONE, CU_TENSOR_MAP_OOB_FILL_ZERO);
if (res == CUDA_SUCCESS) dump_desc("SWIZZLE_128B + OOB_FILL_ZERO", tma_desc);
else printf("=== SWIZZLE_128B + OOB_FILL_ZERO: FAILED (%d) ===\n", res);
cudaFree(d_ptr);
printf("\nPASSED\n");
return 0;