diff --git a/tests/unit/test_umma_qk_hd64.cu b/tests/unit/test_umma_qk_hd64.cu index 6a671f32..76d88851 100644 --- a/tests/unit/test_umma_qk_hd64.cu +++ b/tests/unit/test_umma_qk_hd64.cu @@ -29,7 +29,7 @@ test_umma_qk_hd64(const bf16_t* q, const bf16_t* k, { const int tid = threadIdx.x; const int wid = tid / 32, lane = tid % 32; - const int n_ktiles = hd / MMA_K; // 4 for hd=64 + const int n_ktiles = 1; // hd / MMA_K; // Force 1 K-tile for debugging // SMEM: sQ (128, HD) canonical + sK (128, HD) canonical // Each K-tile of (128, 16) = 4096 bytes