diff --git a/tests/unit/test_mma_ts.cu b/tests/unit/test_mma_ts.cu index 94037723..4592a41d 100644 --- a/tests/unit/test_mma_ts.cu +++ b/tests/unit/test_mma_ts.cu @@ -21,7 +21,7 @@ using namespace dsv4::kernels::attention; constexpr int BLOCK_MN = 128; __global__ void __launch_bounds__(128) -test_mma_ts(float* o_out) +test_mma_ts() { const int tid = threadIdx.x, wid = tid / 32, lane = tid % 32; @@ -117,16 +117,12 @@ test_mma_ts(float* o_out) int main() { printf("=== Minimal tcgen05.mma TS Test ===\n"); - float* d_out; - cudaMalloc(&d_out, 16 * sizeof(float)); - int smem = (4 + 16 + 16*16*2 + 256 + 127) & ~127; - test_mma_ts<<<1, 128, smem>>>(d_out); + test_mma_ts<<<1, 128, smem>>>(); cudaError_t err = cudaDeviceSynchronize(); if (err != cudaSuccess) { printf("CUDA ERROR: %s\n", cudaGetErrorString(err)); return 1; } - printf("Test completed successfully!\n"); - cudaFree(d_out); + printf("Kernel completed!\n"); return 0; }