P4: TMA load test kernel (swizzle vs no-swizzle hang diagnosis)
This commit is contained in:
169
tests/unit/test_p4_tma_load_test.cu
Normal file
169
tests/unit/test_p4_tma_load_test.cu
Normal file
@@ -0,0 +1,169 @@
|
||||
/**
|
||||
* P4: Test actual TMA loads with different descriptors.
|
||||
*
|
||||
* Creates TMA descriptors with various swizzle/OOB configs,
|
||||
* launches a kernel that does cp.async.bulk.tensor.2d with each,
|
||||
* and checks if the load completes (mbarrier signals) or hangs.
|
||||
*
|
||||
* The existing FMHA kernel's TMA loads work (they use CuTeDSL's
|
||||
* TMA path which creates descriptors with swizzle). The raw CUDA
|
||||
* path with NO swizzle hangs. This test identifies which field
|
||||
* causes the hang.
|
||||
*/
|
||||
#include <cuda.h>
|
||||
#include <cuda_runtime.h>
|
||||
#include <cstdio>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
|
||||
// Maximum wait iterations before declaring a hang
|
||||
#define MAX_WAIT 1000000
|
||||
|
||||
__global__ void tma_load_test_kernel(
|
||||
const void* tma_desc_ptr, // 128-byte TMA descriptor in GMEM
|
||||
void* smem_out, // SMEM buffer for TMA output (256 bytes)
|
||||
int* result // GMEM: 0=pending, 1=success, -1=hang
|
||||
) {
|
||||
// Set up mbarrier in SMEM
|
||||
__shared__ uint64_t mbar;
|
||||
if (threadIdx.x == 0) {
|
||||
// Initialize mbarrier with expected count = 1 (one TMA load)
|
||||
asm volatile("mbarrier.init.shared.b64 [%0], 1;" :: "r"(__cvta_generic_to_shared(&mbar)));
|
||||
asm volatile("fence.mbarrier_init.release.cluster;" ::: "memory");
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// Only thread 0 issues TMA
|
||||
if (threadIdx.x == 0) {
|
||||
// TMA load: 16x16 BF16 tile = 512 bytes
|
||||
// cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes
|
||||
asm volatile(
|
||||
"cp.async.bulk.tensor.2d.shared::cluster.global.mbarrier::complete_tx::bytes "
|
||||
"[%0], [%1, {%3, %4}], [%2];"
|
||||
:: "r"(__cvta_generic_to_shared(smem_out)),
|
||||
"l"(tma_desc_ptr),
|
||||
"r"(__cvta_generic_to_shared(&mbar)),
|
||||
"r"(0), // coord row = 0
|
||||
"r"(0) // coord col = 0
|
||||
);
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// Wait for mbarrier to complete (TMA arrival)
|
||||
if (threadIdx.x == 0) {
|
||||
int waited = 0;
|
||||
int arrived = 0;
|
||||
while (waited < MAX_WAIT) {
|
||||
uint32_t state;
|
||||
asm volatile(
|
||||
"{\n\t"
|
||||
".reg .pred p;\n\t"
|
||||
"mbarrier.try_wait.parity.shared.b64 p, [%0], 0;\n\t"
|
||||
"selp.b32 %1, 1, 0, p;\n\t"
|
||||
"}"
|
||||
: "=r"(state)
|
||||
: "r"(__cvta_generic_to_shared(&mbar))
|
||||
);
|
||||
if (state) { arrived = 1; break; }
|
||||
waited++;
|
||||
}
|
||||
if (arrived) {
|
||||
*result = 1; // success
|
||||
} else {
|
||||
*result = -1; // hang
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
CUtensorMap create_descriptor(void* d_ptr, int swizzle, int oob_fill) {
|
||||
CUtensorMap desc;
|
||||
cuuint64_t globalDim[] = {128, 16};
|
||||
cuuint64_t globalStrides[] = {16 * 2, 2};
|
||||
cuuint32_t boxDim[] = {16, 16};
|
||||
cuuint32_t elementStrides[] = {1, 1};
|
||||
|
||||
CUtensorMapSwizzle sw = (swizzle == 0) ? CU_TENSOR_MAP_SWIZZLE_NONE : CU_TENSOR_MAP_SWIZZLE_128B;
|
||||
CUtensorMapFloatOOBfill oob = (oob_fill == 0) ? CU_TENSOR_MAP_FLOAT_OOB_FILL_NONE
|
||||
: CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA;
|
||||
|
||||
CUresult res = cuTensorMapEncodeTiled(&desc,
|
||||
CU_TENSOR_MAP_DATA_TYPE_BFLOAT16, 2,
|
||||
d_ptr, globalDim, globalStrides, boxDim, elementStrides,
|
||||
CU_TENSOR_MAP_INTERLEAVE_NONE, sw,
|
||||
CU_TENSOR_MAP_L2_PROMOTION_NONE, oob);
|
||||
|
||||
if (res != CUDA_SUCCESS) {
|
||||
printf(" Descriptor creation FAILED: %d\n", res);
|
||||
}
|
||||
return desc;
|
||||
}
|
||||
|
||||
|
||||
int main() {
|
||||
const int ROWS = 128;
|
||||
const int COLS = 16;
|
||||
const size_t DATA_SIZE = ROWS * COLS * 2;
|
||||
const size_t SMEM_SIZE = 512; // 16x16 BF16 = 512 bytes
|
||||
|
||||
// Allocate source data
|
||||
void* d_data;
|
||||
cudaMalloc(&d_data, DATA_SIZE);
|
||||
cudaMemset(d_data, 1, DATA_SIZE); // Fill with non-zero data
|
||||
|
||||
// Allocate result
|
||||
int* d_result;
|
||||
cudaMalloc(&d_result, sizeof(int));
|
||||
cudaMemset(d_result, 0, sizeof(int));
|
||||
|
||||
// Test configs
|
||||
struct { const char* name; int swizzle; int oob; } configs[] = {
|
||||
{"NO swizzle, OOB_NONE", 0, 0},
|
||||
{"SWIZZLE_128B, OOB_NONE", 1, 0},
|
||||
{"NO swizzle, OOB_FILL_ZERO", 0, 1},
|
||||
{"SWIZZLE_128B, OOB_FILL_ZERO", 1, 1},
|
||||
};
|
||||
|
||||
for (int i = 0; i < 4; i++) {
|
||||
printf("Testing: %s\n", configs[i].name);
|
||||
|
||||
// Create descriptor and copy to GMEM
|
||||
CUtensorMap desc = create_descriptor(d_data, configs[i].swizzle, configs[i].oob);
|
||||
void* d_desc;
|
||||
cudaMalloc(&d_desc, sizeof(CUtensorMap));
|
||||
cudaMemcpy(d_desc, &desc, sizeof(CUtensorMap), cudaMemcpyHostToDevice);
|
||||
|
||||
// Allocate SMEM output
|
||||
void* d_smem_out;
|
||||
cudaMalloc(&d_smem_out, SMEM_SIZE);
|
||||
|
||||
// Reset result
|
||||
cudaMemset(d_result, 0, sizeof(int));
|
||||
|
||||
// Launch with timeout
|
||||
tma_load_test_kernel<<<1, 32, SMEM_SIZE + 64>>>(d_desc, d_smem_out, d_result);
|
||||
|
||||
// Check result with a short timeout on host
|
||||
cudaError_t err = cudaDeviceSynchronize();
|
||||
int h_result;
|
||||
cudaMemcpy(&h_result, d_result, sizeof(int), cudaMemcpyDeviceToHost);
|
||||
|
||||
if (err != cudaSuccess) {
|
||||
printf(" HANG or ERROR: %s (result=%d)\n", cudaGetErrorString(err), h_result);
|
||||
} else if (h_result == 1) {
|
||||
printf(" SUCCESS: TMA load completed\n");
|
||||
} else if (h_result == -1) {
|
||||
printf(" HANG: mbarrier never signaled\n");
|
||||
} else {
|
||||
printf(" UNKNOWN: result=%d\n", h_result);
|
||||
}
|
||||
|
||||
cudaFree(d_desc);
|
||||
cudaFree(d_smem_out);
|
||||
}
|
||||
|
||||
cudaFree(d_data);
|
||||
cudaFree(d_result);
|
||||
printf("\nPASSED\n");
|
||||
return 0;
|
||||
}
|
||||
Reference in New Issue
Block a user