diff --git a/tests/unit/test_umma_qk_hd64.cu b/tests/unit/test_umma_qk_hd64.cu index 783a5768..8b3663e0 100644 --- a/tests/unit/test_umma_qk_hd64.cu +++ b/tests/unit/test_umma_qk_hd64.cu @@ -97,7 +97,7 @@ test_umma_hd64(const bf16_t* __restrict__ q, const bf16_t* __restrict__ k, uint32_t sK_smem = __cvta_generic_to_shared(sK); uint32_t idesc = make_idesc(BLOCK_MN, BLOCK_MN); - for (int kt = 0; kt < NKT; kt++) { + for (int kt = 0; kt < 1; kt++) { // DEBUG: single K-tile from full SMEM // K-tile offset in canonical layout: // Each 16-BF16 K-tile spans 2 core columns. // Core column 2*kt starts at offset 2*kt * (128/8 * 128) bytes = 2*kt * 2048 bytes = kt * 4096 bytes. @@ -144,7 +144,7 @@ test_umma_hd64(const bf16_t* __restrict__ q, const bf16_t* __restrict__ k, if (tid == 0) { for (int j = 0; j < SK; j++) { float dot = 0.0f; - for (int d = 0; d < HD; d++) + for (int d = 0; d < 16; d++) // DEBUG: single K-tile dot += bf16_to_f32(q[d]) * bf16_to_f32(k[j * HD + d]); s_scalar[j] = dot * scale; }