fix: correct SMEM size for MMA 4-warp test
This commit is contained in:
@@ -153,7 +153,8 @@ int main() {
|
||||
float* d_r;
|
||||
cudaMalloc(&d_r, 32 * sizeof(float));
|
||||
cudaMemset(d_r, 0, 32 * sizeof(float));
|
||||
test_mma_4warp_read<<<1, 192, 4096>>>(d_r);
|
||||
int smem = 256 + 128 + 128*16*2*2 + 256; // sbuf + align + sA + sB + slack
|
||||
test_mma_4warp_read<<<1, 192, smem>>>(d_r);
|
||||
|
||||
cudaError_t err = cudaDeviceSynchronize();
|
||||
if (err != cudaSuccess) {
|
||||
|
||||
Reference in New Issue
Block a user