From bf4dfd131be39a7fe564257b580e5d694fdda737 Mon Sep 17 00:00:00 2001
From: biondizzle <biondizzle@gmail.com>
Date: Thu, 28 May 2026 20:04:59 +0000
Subject: [PATCH] Fix nvcc goto-bypasses-init: move var decls before goto
 statements

---
 tests/unit/test_fmha_6warp_multirow.cu | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/tests/unit/test_fmha_6warp_multirow.cu b/tests/unit/test_fmha_6warp_multirow.cu
index 834cf8db..68b3e1ed 100644
--- a/tests/unit/test_fmha_6warp_multirow.cu
+++ b/tests/unit/test_fmha_6warp_multirow.cu
@@ -70,6 +70,8 @@ static int test_single_T(int T, int n_h = 1, int batch = 1) {
     printf("\n=== Test T=%d, n_h=%d, batch=%d, HD=%d, SK=%d (%s) ===\n", T, n_h, batch, HD, SK, mode);
     const float SCALE = 1.0f / sqrtf((float)HD);
     int pass = 1;
+    int checked = 0, failed = 0;
+    float min_cos = 1.0f;
 
     int total_heads = batch * n_h;
 
@@ -142,10 +144,6 @@ static int test_single_T(int T, int n_h = 1, int batch = 1) {
         pass = 0; goto cleanup;
     }
 
-    // Verify each head (declared before goto targets)
-    int checked = 0, failed = 0;
-    float min_cos = 1.0f;
-
     cudaMemcpy(h_o, d_o, total_heads * T * HD * sizeof(bf16_t), cudaMemcpyDeviceToHost);
     cudaMemcpy(h_lse, d_lse, total_heads * T * sizeof(float), cudaMemcpyDeviceToHost);
     for (int b = 0; b < batch; b++) {