diff --git a/tests/unit/test_tmem_4warp_read.cu b/tests/unit/test_tmem_4warp_read.cu index a3f26922..e32ee1d4 100644 --- a/tests/unit/test_tmem_4warp_read.cu +++ b/tests/unit/test_tmem_4warp_read.cu @@ -153,7 +153,8 @@ int main() { float* d_r; cudaMalloc(&d_r, 32 * sizeof(float)); cudaMemset(d_r, 0, 32 * sizeof(float)); - test_mma_4warp_read<<<1, 192, 4096>>>(d_r); + int smem = 256 + 128 + 128*16*2*2 + 256; // sbuf + align + sA + sB + slack + test_mma_4warp_read<<<1, 192, smem>>>(d_r); cudaError_t err = cudaDeviceSynchronize(); if (err != cudaSuccess) {