fix: correct SMEM size for MMA 4-warp test

This commit is contained in:
2026-05-28 23:01:12 +00:00
parent be45e87891
commit d54bce6a6d

View File

@@ -153,7 +153,8 @@ int main() {
float* d_r;
cudaMalloc(&d_r, 32 * sizeof(float));
cudaMemset(d_r, 0, 32 * sizeof(float));
test_mma_4warp_read<<<1, 192, 4096>>>(d_r);
int smem = 256 + 128 + 128*16*2*2 + 256; // sbuf + align + sA + sB + slack
test_mma_4warp_read<<<1, 192, smem>>>(d_r);
cudaError_t err = cudaDeviceSynchronize();
if (err != cudaSuccess) {