diff --git a/tests/unit/test_fmha_6warp_multirow.cu b/tests/unit/test_fmha_6warp_multirow.cu index 834cf8db..68b3e1ed 100644 --- a/tests/unit/test_fmha_6warp_multirow.cu +++ b/tests/unit/test_fmha_6warp_multirow.cu @@ -70,6 +70,8 @@ static int test_single_T(int T, int n_h = 1, int batch = 1) { printf("\n=== Test T=%d, n_h=%d, batch=%d, HD=%d, SK=%d (%s) ===\n", T, n_h, batch, HD, SK, mode); const float SCALE = 1.0f / sqrtf((float)HD); int pass = 1; + int checked = 0, failed = 0; + float min_cos = 1.0f; int total_heads = batch * n_h; @@ -142,10 +144,6 @@ static int test_single_T(int T, int n_h = 1, int batch = 1) { pass = 0; goto cleanup; } - // Verify each head (declared before goto targets) - int checked = 0, failed = 0; - float min_cos = 1.0f; - cudaMemcpy(h_o, d_o, total_heads * T * HD * sizeof(bf16_t), cudaMemcpyDeviceToHost); cudaMemcpy(h_lse, d_lse, total_heads * T * sizeof(float), cudaMemcpyDeviceToHost); for (int b = 0; b < batch; b++) {